diff --git "a/dev.ipynb" "b/dev.ipynb" --- "a/dev.ipynb" +++ "b/dev.ipynb" @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 47, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/andrewreed/Documents/success_projects/closed-vs-open-arena-elo/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import os\n", "import pickle\n", @@ -22,67 +31,60 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "fs = HfFileSystem()\n", + "from typing import Literal\n", "\n", "\n", - "def extract_date(filename):\n", - " return filename.split(\"/\")[-1].split(\".\")[0].split(\"_\")[-1]\n", + "def download_latest_data_from_space(\n", + " repo_id: str, file_type: Literal[\"pkl\", \"csv\"]\n", + ") -> str:\n", + " \"\"\"\n", + " Downloads the latest data file of the specified file type from the given repository space.\n", "\n", + " Args:\n", + " repo_id (str): The ID of the repository space.\n", + " file_type (Literal[\"pkl\", \"csv\"]): The type of the data file to download. Must be either \"pkl\" or \"csv\".\n", "\n", - "ELO_DATA_FILES = \"spaces/lmsys/chatbot-arena-leaderboard/*.pkl\"\n", - "elo_files = fs.glob(ELO_DATA_FILES)\n", - "latest_elo_file = sorted(elo_files, key=extract_date, reverse=True)[0]\n", + " Returns:\n", + " str: The local file path of the downloaded data file.\n", + " \"\"\"\n", "\n", - "LEADERBOARD_DATA_FILES = \"spaces/lmsys/chatbot-arena-leaderboard/*.csv\"\n", - "leaderboard_files = fs.glob(LEADERBOARD_DATA_FILES)\n", - "latest_leaderboard_file = sorted(leaderboard_files, key=extract_date, reverse=True)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('leaderboard_table_20240426.csv', 'elo_results_20240426.pkl')" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "latest_leaderboard_file.split(\"/\")[-1], latest_elo_file.split(\"/\")[-1]" + " def extract_date(filename):\n", + " return filename.split(\"/\")[-1].split(\".\")[0].split(\"_\")[-1]\n", + "\n", + " fs = HfFileSystem()\n", + " data_file_path = f\"spaces/{repo_id}/*.{file_type}\"\n", + " files = fs.glob(data_file_path)\n", + " latest_file = sorted(files, key=extract_date, reverse=True)[0]\n", + "\n", + " latest_filepath_local = hf_hub_download(\n", + " repo_id=repo_id,\n", + " filename=latest_file.split(\"/\")[-1],\n", + " repo_type=\"space\",\n", + " )\n", + " return latest_filepath_local" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "latest_elo_file_local = hf_hub_download(\n", - " repo_id=\"lmsys/chatbot-arena-leaderboard\",\n", - " filename=latest_elo_file.split(\"/\")[-1],\n", - " repo_type=\"space\",\n", + "latest_leaderboard_file_local = download_latest_data_from_space(\n", + " repo_id=\"lmsys/chatbot-arena-leaderboard\", file_type=\"csv\"\n", ")\n", - "latest_leaderboard_file_local = hf_hub_download(\n", - " repo_id=\"lmsys/chatbot-arena-leaderboard\",\n", - " filename=latest_leaderboard_file.split(\"/\")[-1],\n", - " repo_type=\"space\",\n", + "latest_elo_file_local = download_latest_data_from_space(\n", + " repo_id=\"lmsys/chatbot-arena-leaderboard\", file_type=\"pkl\"\n", ")" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -122,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -131,7 +133,7 @@ "dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])" ] }, - "execution_count": 77, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -142,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -177,48 +179,48 @@ " \n", " \n", " RWKV-4-Raven-14B\n", - " 927.710294\n", - " 27.143015\n", - " 935.717850\n", - " 916.546369\n", + " 928.451251\n", + " 26.146415\n", + " 937.017097\n", + " 919.444359\n", " 5129\n", - " 81\n", + " 82\n", " \n", " \n", " alpaca-13b\n", - " 907.324482\n", - " 20.736682\n", - " 915.536856\n", - " 899.330070\n", + " 908.084359\n", + " 18.598539\n", + " 915.348707\n", + " 900.602847\n", " 6111\n", - " 85\n", + " 86\n", " \n", " \n", " bard-jan-24-gemini-pro\n", - " 1208.505408\n", - " 6.679087\n", - " 1213.291358\n", - " 1203.926901\n", - " 12388\n", + " 1208.712877\n", + " 7.975296\n", + " 1213.331583\n", + " 1203.004139\n", + " 12387\n", " 6\n", " \n", " \n", " chatglm-6b\n", - " 886.107553\n", - " 17.110417\n", - " 894.034333\n", - " 878.094776\n", + " 886.873429\n", + " 19.813751\n", + " 894.785321\n", + " 878.677878\n", " 5195\n", - " 86\n", + " 87\n", " \n", " \n", " chatglm2-6b\n", - " 932.678460\n", - " 33.530570\n", - " 943.455598\n", - " 921.346322\n", + " 933.337288\n", + " 33.939472\n", + " 944.493496\n", + " 921.470740\n", " 2880\n", - " 81\n", + " 82\n", " \n", " \n", " ...\n", @@ -231,85 +233,85 @@ " \n", " \n", " wizardlm-70b\n", - " 1107.992552\n", - " 9.385887\n", - " 1114.218223\n", - " 1102.655575\n", - " 8868\n", + " 1108.552744\n", + " 8.988005\n", + " 1114.390689\n", + " 1102.745236\n", + " 8867\n", " 29\n", " \n", " \n", " yi-34b-chat\n", - " 1109.722447\n", - " 8.596908\n", - " 1115.182579\n", - " 1103.991095\n", - " 12252\n", + " 1111.132640\n", + " 7.801741\n", + " 1115.356993\n", + " 1105.658254\n", + " 13177\n", " 29\n", " \n", " \n", " zephyr-7b-alpha\n", - " 1042.108710\n", - " 43.900714\n", - " 1052.991768\n", - " 1027.160917\n", + " 1043.084267\n", + " 45.472021\n", + " 1054.269954\n", + " 1027.602171\n", " 1901\n", - " 58\n", + " 57\n", " \n", " \n", " zephyr-7b-beta\n", - " 1053.655680\n", - " 10.297607\n", - " 1059.923254\n", - " 1047.601629\n", + " 1054.416300\n", + " 11.094606\n", + " 1060.265072\n", + " 1047.790509\n", " 11924\n", - " 54\n", + " 55\n", " \n", " \n", " zephyr-orpo-141b-A35b-v0.1\n", - " 1124.677515\n", - " 22.288515\n", - " 1132.728887\n", - " 1113.848432\n", - " 4276\n", + " 1128.816337\n", + " 16.964385\n", + " 1134.862680\n", + " 1119.183571\n", + " 5207\n", " 22\n", " \n", " \n", "\n", - "

91 rows × 6 columns

\n", + "

92 rows × 6 columns

\n", "" ], "text/plain": [ " rating variance rating_q975 rating_q025 \\\n", - "RWKV-4-Raven-14B 927.710294 27.143015 935.717850 916.546369 \n", - "alpaca-13b 907.324482 20.736682 915.536856 899.330070 \n", - "bard-jan-24-gemini-pro 1208.505408 6.679087 1213.291358 1203.926901 \n", - "chatglm-6b 886.107553 17.110417 894.034333 878.094776 \n", - "chatglm2-6b 932.678460 33.530570 943.455598 921.346322 \n", + "RWKV-4-Raven-14B 928.451251 26.146415 937.017097 919.444359 \n", + "alpaca-13b 908.084359 18.598539 915.348707 900.602847 \n", + "bard-jan-24-gemini-pro 1208.712877 7.975296 1213.331583 1203.004139 \n", + "chatglm-6b 886.873429 19.813751 894.785321 878.677878 \n", + "chatglm2-6b 933.337288 33.939472 944.493496 921.470740 \n", "... ... ... ... ... \n", - "wizardlm-70b 1107.992552 9.385887 1114.218223 1102.655575 \n", - "yi-34b-chat 1109.722447 8.596908 1115.182579 1103.991095 \n", - "zephyr-7b-alpha 1042.108710 43.900714 1052.991768 1027.160917 \n", - "zephyr-7b-beta 1053.655680 10.297607 1059.923254 1047.601629 \n", - "zephyr-orpo-141b-A35b-v0.1 1124.677515 22.288515 1132.728887 1113.848432 \n", + "wizardlm-70b 1108.552744 8.988005 1114.390689 1102.745236 \n", + "yi-34b-chat 1111.132640 7.801741 1115.356993 1105.658254 \n", + "zephyr-7b-alpha 1043.084267 45.472021 1054.269954 1027.602171 \n", + "zephyr-7b-beta 1054.416300 11.094606 1060.265072 1047.790509 \n", + "zephyr-orpo-141b-A35b-v0.1 1128.816337 16.964385 1134.862680 1119.183571 \n", "\n", " num_battles final_ranking \n", - "RWKV-4-Raven-14B 5129 81 \n", - "alpaca-13b 6111 85 \n", - "bard-jan-24-gemini-pro 12388 6 \n", - "chatglm-6b 5195 86 \n", - "chatglm2-6b 2880 81 \n", + "RWKV-4-Raven-14B 5129 82 \n", + "alpaca-13b 6111 86 \n", + "bard-jan-24-gemini-pro 12387 6 \n", + "chatglm-6b 5195 87 \n", + "chatglm2-6b 2880 82 \n", "... ... ... \n", - "wizardlm-70b 8868 29 \n", - "yi-34b-chat 12252 29 \n", - "zephyr-7b-alpha 1901 58 \n", - "zephyr-7b-beta 11924 54 \n", - "zephyr-orpo-141b-A35b-v0.1 4276 22 \n", + "wizardlm-70b 8867 29 \n", + "yi-34b-chat 13177 29 \n", + "zephyr-7b-alpha 1901 57 \n", + "zephyr-7b-beta 11924 55 \n", + "zephyr-orpo-141b-A35b-v0.1 5207 22 \n", "\n", - "[91 rows x 6 columns]" + "[92 rows x 6 columns]" ] }, - "execution_count": 78, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -320,7 +322,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -330,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -432,17 +434,6 @@ " ...\n", " \n", " \n", - " 100\n", - " mixtral-8x22b-instruct-v0.1\n", - " Mixtral-8x22b-Instruct-v0.1\n", - " -\n", - " 0.778\n", - " 2024/4\n", - " Apache 2.0\n", - " Mistral\n", - " https://mistral.ai/news/mixtral-8x22b/\n", - " \n", - " \n", " 101\n", " llama-3-70b-instruct\n", " Llama-3-70b-Instruct\n", @@ -486,9 +477,20 @@ " Microsoft\n", " https://azure.microsoft.com/en-us/blog/introdu...\n", " \n", + " \n", + " 105\n", + " snowflake-arctic-instruct\n", + " Snowflake Arctic Instruct\n", + " -\n", + " 0.673\n", + " 2024/4\n", + " Apache 2.0\n", + " Snowflake\n", + " https://www.snowflake.com/blog/arctic-open-eff...\n", + " \n", " \n", "\n", - "

105 rows × 8 columns

\n", + "

106 rows × 8 columns

\n", "" ], "text/plain": [ @@ -499,11 +501,11 @@ "3 tulu-30b Tulu-30B \n", "4 guanaco-65b Guanaco-65B \n", ".. ... ... \n", - "100 mixtral-8x22b-instruct-v0.1 Mixtral-8x22b-Instruct-v0.1 \n", "101 llama-3-70b-instruct Llama-3-70b-Instruct \n", "102 llama-3-8b-instruct Llama-3-8b-Instruct \n", "103 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n", "104 phi-3-mini-128k-instruct Phi-3-Mini-128k-Instruct \n", + "105 snowflake-arctic-instruct Snowflake Arctic Instruct \n", "\n", " MT-bench (score) MMLU Knowledge cutoff date License \\\n", "0 7.01 0.587 2023/6 Non-commercial \n", @@ -512,11 +514,11 @@ "3 6.43 0.581 2023/6 Non-commercial \n", "4 6.41 0.621 2023/5 Non-commercial \n", ".. ... ... ... ... \n", - "100 - 0.778 2024/4 Apache 2.0 \n", "101 - 0.820 2023/12 Llama 3 Community \n", "102 - 0.684 2023/3 Llama 3 Community \n", "103 - 0.819 2023/11 Proprietary \n", "104 - 0.681 2023/10 MIT \n", + "105 - 0.673 2024/4 Apache 2.0 \n", "\n", " Organization Link \n", "0 Microsoft https://huggingface.co/WizardLM/WizardLM-30B-V1.0 \n", @@ -525,16 +527,16 @@ "3 AllenAI/UW https://huggingface.co/allenai/tulu-30b \n", "4 UW https://huggingface.co/timdettmers/guanaco-65b... \n", ".. ... ... \n", - "100 Mistral https://mistral.ai/news/mixtral-8x22b/ \n", "101 Meta https://llama.meta.com/llama3/ \n", "102 Meta https://llama.meta.com/llama3/ \n", "103 Google https://blog.google/technology/ai/google-gemin... \n", "104 Microsoft https://azure.microsoft.com/en-us/blog/introdu... \n", + "105 Snowflake https://www.snowflake.com/blog/arctic-open-eff... \n", "\n", - "[105 rows x 8 columns]" + "[106 rows x 8 columns]" ] }, - "execution_count": 80, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -545,7 +547,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -554,7 +556,7 @@ "dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])" ] }, - "execution_count": 82, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -565,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -581,7 +583,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -624,11 +626,11 @@ " \n", " \n", " 0\n", - " 1257.399407\n", - " 4.283316\n", - " 1261.676224\n", - " 1254.003626\n", - " 30562\n", + " 1258.815279\n", + " 3.258132\n", + " 1262.796713\n", + " 1256.000508\n", + " 35931\n", " 1\n", " gpt-4-turbo-2024-04-09\n", " GPT-4-Turbo-2024-04-09\n", @@ -641,12 +643,12 @@ " \n", " \n", " 1\n", - " 1253.025095\n", - " 2.069534\n", - " 1256.111392\n", - " 1250.435207\n", - " 69871\n", - " 1\n", + " 1252.684886\n", + " 1.799233\n", + " 1254.748391\n", + " 1249.873417\n", + " 73547\n", + " 2\n", " gpt-4-1106-preview\n", " GPT-4-1106-preview\n", " 9.32\n", @@ -658,11 +660,11 @@ " \n", " \n", " 2\n", - " 1251.114220\n", - " 1.862842\n", - " 1253.629093\n", - " 1248.362042\n", - " 75684\n", + " 1250.926206\n", + " 2.018201\n", + " 1253.851885\n", + " 1248.166034\n", + " 80997\n", " 2\n", " claude-3-opus-20240229\n", " Claude 3 Opus\n", @@ -675,11 +677,11 @@ " \n", " \n", " 3\n", - " 1247.662508\n", - " 3.263747\n", - " 1251.582645\n", - " 1244.380454\n", - " 33723\n", + " 1249.618395\n", + " 3.233129\n", + " 1252.956497\n", + " 1246.247080\n", + " 39482\n", " 2\n", " gemini-1.5-pro-api-0409-preview\n", " Gemini 1.5 Pro API-0409-Preview\n", @@ -692,12 +694,12 @@ " \n", " \n", " 4\n", - " 1247.277052\n", - " 1.923014\n", - " 1249.489411\n", - " 1244.340257\n", - " 61924\n", - " 3\n", + " 1246.777591\n", + " 1.942477\n", + " 1249.979712\n", + " 1244.305362\n", + " 67354\n", + " 2\n", " gpt-4-0125-preview\n", " GPT-4-0125-preview\n", " -\n", @@ -708,308 +710,138 @@ " https://openai.com/blog/new-models-and-develop...\n", " \n", " \n", - " 5\n", - " 1208.505408\n", - " 6.679087\n", - " 1213.291358\n", - " 1203.926901\n", - " 12388\n", - " 6\n", - " bard-jan-24-gemini-pro\n", - " Bard (Gemini Pro)\n", - " -\n", - " -\n", - " Online\n", - " Proprietary\n", - " Google\n", - " https://bard.google.com/\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 6\n", - " 1207.497541\n", - " 4.109466\n", - " 1211.720734\n", - " 1203.322762\n", - " 27298\n", - " 6\n", - " llama-3-70b-instruct\n", - " Llama-3-70b-Instruct\n", - " -\n", - " 0.820\n", - " 2023/12\n", - " Llama 3 Community\n", - " Meta\n", - " https://llama.meta.com/llama3/\n", + " 87\n", + " 886.873429\n", + " 19.813751\n", + " 894.785321\n", + " 878.677878\n", + " 5195\n", + " 87\n", + " chatglm-6b\n", + " ChatGLM-6B\n", + " 4.50\n", + " 0.361\n", + " 2023/3\n", + " Non-commercial\n", + " Tsinghua\n", + " https://huggingface.co/THUDM/chatglm-6b\n", " \n", " \n", - " 7\n", - " 1201.671254\n", - " 2.525563\n", - " 1204.862512\n", - " 1198.658822\n", - " 75418\n", - " 6\n", - " claude-3-sonnet-20240229\n", - " Claude 3 Sonnet\n", - " -\n", - " 0.790\n", - " 2023/8\n", - " Proprietary\n", - " Anthropic\n", - " https://www.anthropic.com/news/claude-3-family\n", + " 88\n", + " 876.929108\n", + " 27.115855\n", + " 887.355529\n", + " 866.860534\n", + " 4521\n", + " 88\n", + " fastchat-t5-3b\n", + " FastChat-T5-3B\n", + " 3.04\n", + " 0.477\n", + " 2023/4\n", + " Apache 2.0\n", + " LMSYS\n", + " https://huggingface.co/lmsys/fastchat-t5-3b-v1.0\n", " \n", " \n", - " 8\n", - " 1191.684542\n", - " 3.459717\n", - " 1195.080256\n", - " 1188.222382\n", - " 41262\n", - " 9\n", - " command-r-plus\n", - " Command R+\n", - " -\n", - " -\n", - " 2024/3\n", - " CC-BY-NC-4.0\n", - " Cohere\n", - " https://txt.cohere.com/command-r-plus-microsof...\n", - " \n", - " \n", - " 9\n", - " 1188.987389\n", - " 3.124792\n", - " 1193.335535\n", - " 1185.935928\n", - " 48390\n", - " 9\n", - " gpt-4-0314\n", - " GPT-4-0314\n", - " 8.96\n", - " 0.864\n", - " 2021/9\n", - " Proprietary\n", - " OpenAI\n", - " https://openai.com/research/gpt-4\n", - " \n", - " \n", - " 10\n", - " 1180.606870\n", - " 3.097542\n", - " 1183.825403\n", - " 1177.255203\n", - " 66065\n", - " 11\n", - " claude-3-haiku-20240307\n", - " Claude 3 Haiku\n", - " -\n", - " 0.752\n", - " 2023/8\n", - " Proprietary\n", - " Anthropic\n", - " https://www.anthropic.com/news/claude-3-family\n", + " 89\n", + " 848.932568\n", + " 36.961459\n", + " 859.103936\n", + " 837.364341\n", + " 3461\n", + " 90\n", + " stablelm-tuned-alpha-7b\n", + " StableLM-Tuned-Alpha-7B\n", + " 2.75\n", + " 0.244\n", + " 2023/4\n", + " CC-BY-NC-SA-4.0\n", + " Stability AI\n", + " https://huggingface.co/stabilityai/stablelm-tu...\n", " \n", " \n", - " 11\n", - " 1164.896561\n", - " 2.585577\n", - " 1167.595696\n", - " 1161.727454\n", - " 67038\n", - " 12\n", - " gpt-4-0613\n", - " GPT-4-0613\n", - " 9.18\n", - " -\n", - " 2021/9\n", - " Proprietary\n", - " OpenAI\n", - " https://platform.openai.com/docs/models/gpt-4-...\n", + " 90\n", + " 826.647332\n", + " 30.156414\n", + " 837.335988\n", + " 816.370788\n", + " 3666\n", + " 91\n", + " dolly-v2-12b\n", + " Dolly-V2-12B\n", + " 3.28\n", + " 0.257\n", + " 2023/4\n", + " MIT\n", + " Databricks\n", + " https://huggingface.co/databricks/dolly-v2-12b\n", " \n", " \n", - " 12\n", - " 1157.638992\n", - " 2.541320\n", - " 1160.496116\n", - " 1154.927748\n", - " 44120\n", - " 13\n", - " mistral-large-2402\n", - " Mistral-Large-2402\n", - " -\n", - " 0.812\n", - " -\n", - " Proprietary\n", - " Mistral\n", - " https://mistral.ai/news/mistral-large/\n", - " \n", - " \n", - " 13\n", - " 1153.464280\n", - " 3.631512\n", - " 1157.068850\n", - " 1150.178903\n", - " 32999\n", - " 13\n", - " qwen1.5-72b-chat\n", - " Qwen1.5-72B-Chat\n", - " 8.61\n", - " 0.775\n", - " 2024/2\n", - " Qianwen LICENSE\n", - " Alibaba\n", - " https://qwenlm.github.io/blog/qwen1.5/\n", - " \n", - " \n", - " 14\n", - " 1150.918473\n", - " 9.062217\n", - " 1155.969721\n", - " 1145.229885\n", - " 8622\n", - " 13\n", - " reka-flash-21b-20240226-online\n", - " Reka-Flash-21B-online\n", - " -\n", - " -\n", - " Online\n", - " Proprietary\n", - " Reka AI\n", - " https://docs.reka.ai/http-api.html#generation\n", - " \n", - " \n", - " 15\n", - " 1150.244313\n", - " 5.551373\n", - " 1154.745214\n", - " 1145.496466\n", - " 21768\n", - " 14\n", - " claude-1\n", - " Claude-1\n", - " 7.90\n", - " 0.770\n", - " -\n", - " Proprietary\n", - " Anthropic\n", - " https://www.anthropic.com/index/introducing-cl...\n", - " \n", - " \n", - " 16\n", - " 1149.267578\n", - " 11.452272\n", - " 1154.290155\n", - " 1141.931621\n", - " 9059\n", - " 14\n", - " reka-flash-21b-20240226\n", - " Reka-Flash-21B\n", - " -\n", - " 0.735\n", - " 2023/11\n", - " Proprietary\n", - " Reka AI\n", - " https://www.reka.ai/news/reka-flash-efficient-...\n", - " \n", - " \n", - " 17\n", - " 1148.072155\n", - " 3.071222\n", - " 1151.980865\n", - " 1144.992044\n", - " 37413\n", - " 14\n", - " command-r\n", - " Command R\n", - " -\n", - " -\n", - " 2024/3\n", - " CC-BY-NC-4.0\n", - " Cohere\n", - " https://txt.cohere.com/command-r\n", - " \n", - " \n", - " 18\n", - " 1147.668325\n", - " 3.542229\n", - " 1150.726489\n", - " 1143.868385\n", - " 32738\n", - " 14\n", - " mistral-medium\n", - " Mistral Medium\n", - " 8.61\n", - " 0.753\n", - " -\n", - " Proprietary\n", - " Mistral\n", - " https://mistral.ai/news/la-plateforme/\n", - " \n", - " \n", - " 19\n", - " 1147.473989\n", - " 5.789710\n", - " 1151.989352\n", - " 1143.322918\n", - " 17214\n", - " 14\n", - " mixtral-8x22b-instruct-v0.1\n", - " Mixtral-8x22b-Instruct-v0.1\n", - " -\n", - " 0.778\n", - " 2024/4\n", - " Apache 2.0\n", - " Mistral\n", - " https://mistral.ai/news/mixtral-8x22b/\n", + " 91\n", + " 804.356329\n", + " 44.756983\n", + " 815.161492\n", + " 790.879536\n", + " 2538\n", + " 92\n", + " llama-13b\n", + " LLaMA-13B\n", + " 2.61\n", + " 0.470\n", + " 2023/2\n", + " Non-commercial\n", + " Meta\n", + " https://arxiv.org/abs/2302.13971\n", " \n", " \n", "\n", + "

92 rows × 14 columns

\n", "" ], "text/plain": [ " rating variance rating_q975 rating_q025 num_battles \\\n", - "0 1257.399407 4.283316 1261.676224 1254.003626 30562 \n", - "1 1253.025095 2.069534 1256.111392 1250.435207 69871 \n", - "2 1251.114220 1.862842 1253.629093 1248.362042 75684 \n", - "3 1247.662508 3.263747 1251.582645 1244.380454 33723 \n", - "4 1247.277052 1.923014 1249.489411 1244.340257 61924 \n", - "5 1208.505408 6.679087 1213.291358 1203.926901 12388 \n", - "6 1207.497541 4.109466 1211.720734 1203.322762 27298 \n", - "7 1201.671254 2.525563 1204.862512 1198.658822 75418 \n", - "8 1191.684542 3.459717 1195.080256 1188.222382 41262 \n", - "9 1188.987389 3.124792 1193.335535 1185.935928 48390 \n", - "10 1180.606870 3.097542 1183.825403 1177.255203 66065 \n", - "11 1164.896561 2.585577 1167.595696 1161.727454 67038 \n", - "12 1157.638992 2.541320 1160.496116 1154.927748 44120 \n", - "13 1153.464280 3.631512 1157.068850 1150.178903 32999 \n", - "14 1150.918473 9.062217 1155.969721 1145.229885 8622 \n", - "15 1150.244313 5.551373 1154.745214 1145.496466 21768 \n", - "16 1149.267578 11.452272 1154.290155 1141.931621 9059 \n", - "17 1148.072155 3.071222 1151.980865 1144.992044 37413 \n", - "18 1147.668325 3.542229 1150.726489 1143.868385 32738 \n", - "19 1147.473989 5.789710 1151.989352 1143.322918 17214 \n", + "0 1258.815279 3.258132 1262.796713 1256.000508 35931 \n", + "1 1252.684886 1.799233 1254.748391 1249.873417 73547 \n", + "2 1250.926206 2.018201 1253.851885 1248.166034 80997 \n", + "3 1249.618395 3.233129 1252.956497 1246.247080 39482 \n", + "4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n", + ".. ... ... ... ... ... \n", + "87 886.873429 19.813751 894.785321 878.677878 5195 \n", + "88 876.929108 27.115855 887.355529 866.860534 4521 \n", + "89 848.932568 36.961459 859.103936 837.364341 3461 \n", + "90 826.647332 30.156414 837.335988 816.370788 3666 \n", + "91 804.356329 44.756983 815.161492 790.879536 2538 \n", "\n", " final_ranking key \\\n", "0 1 gpt-4-turbo-2024-04-09 \n", - "1 1 gpt-4-1106-preview \n", + "1 2 gpt-4-1106-preview \n", "2 2 claude-3-opus-20240229 \n", "3 2 gemini-1.5-pro-api-0409-preview \n", - "4 3 gpt-4-0125-preview \n", - "5 6 bard-jan-24-gemini-pro \n", - "6 6 llama-3-70b-instruct \n", - "7 6 claude-3-sonnet-20240229 \n", - "8 9 command-r-plus \n", - "9 9 gpt-4-0314 \n", - "10 11 claude-3-haiku-20240307 \n", - "11 12 gpt-4-0613 \n", - "12 13 mistral-large-2402 \n", - "13 13 qwen1.5-72b-chat \n", - "14 13 reka-flash-21b-20240226-online \n", - "15 14 claude-1 \n", - "16 14 reka-flash-21b-20240226 \n", - "17 14 command-r \n", - "18 14 mistral-medium \n", - "19 14 mixtral-8x22b-instruct-v0.1 \n", + "4 2 gpt-4-0125-preview \n", + ".. ... ... \n", + "87 87 chatglm-6b \n", + "88 88 fastchat-t5-3b \n", + "89 90 stablelm-tuned-alpha-7b \n", + "90 91 dolly-v2-12b \n", + "91 92 llama-13b \n", "\n", " Model MT-bench (score) MMLU \\\n", "0 GPT-4-Turbo-2024-04-09 - - \n", @@ -1017,43 +849,25 @@ "2 Claude 3 Opus - 0.868 \n", "3 Gemini 1.5 Pro API-0409-Preview - 0.819 \n", "4 GPT-4-0125-preview - - \n", - "5 Bard (Gemini Pro) - - \n", - "6 Llama-3-70b-Instruct - 0.820 \n", - "7 Claude 3 Sonnet - 0.790 \n", - "8 Command R+ - - \n", - "9 GPT-4-0314 8.96 0.864 \n", - "10 Claude 3 Haiku - 0.752 \n", - "11 GPT-4-0613 9.18 - \n", - "12 Mistral-Large-2402 - 0.812 \n", - "13 Qwen1.5-72B-Chat 8.61 0.775 \n", - "14 Reka-Flash-21B-online - - \n", - "15 Claude-1 7.90 0.770 \n", - "16 Reka-Flash-21B - 0.735 \n", - "17 Command R - - \n", - "18 Mistral Medium 8.61 0.753 \n", - "19 Mixtral-8x22b-Instruct-v0.1 - 0.778 \n", + ".. ... ... ... \n", + "87 ChatGLM-6B 4.50 0.361 \n", + "88 FastChat-T5-3B 3.04 0.477 \n", + "89 StableLM-Tuned-Alpha-7B 2.75 0.244 \n", + "90 Dolly-V2-12B 3.28 0.257 \n", + "91 LLaMA-13B 2.61 0.470 \n", "\n", - " Knowledge cutoff date License Organization \\\n", - "0 2023/12 Proprietary OpenAI \n", - "1 2023/4 Proprietary OpenAI \n", - "2 2023/8 Proprietary Anthropic \n", - "3 2023/11 Proprietary Google \n", - "4 2023/12 Proprietary OpenAI \n", - "5 Online Proprietary Google \n", - "6 2023/12 Llama 3 Community Meta \n", - "7 2023/8 Proprietary Anthropic \n", - "8 2024/3 CC-BY-NC-4.0 Cohere \n", - "9 2021/9 Proprietary OpenAI \n", - "10 2023/8 Proprietary Anthropic \n", - "11 2021/9 Proprietary OpenAI \n", - "12 - Proprietary Mistral \n", - "13 2024/2 Qianwen LICENSE Alibaba \n", - "14 Online Proprietary Reka AI \n", - "15 - Proprietary Anthropic \n", - "16 2023/11 Proprietary Reka AI \n", - "17 2024/3 CC-BY-NC-4.0 Cohere \n", - "18 - Proprietary Mistral \n", - "19 2024/4 Apache 2.0 Mistral \n", + " Knowledge cutoff date License Organization \\\n", + "0 2023/12 Proprietary OpenAI \n", + "1 2023/4 Proprietary OpenAI \n", + "2 2023/8 Proprietary Anthropic \n", + "3 2023/11 Proprietary Google \n", + "4 2023/12 Proprietary OpenAI \n", + ".. ... ... ... \n", + "87 2023/3 Non-commercial Tsinghua \n", + "88 2023/4 Apache 2.0 LMSYS \n", + "89 2023/4 CC-BY-NC-SA-4.0 Stability AI \n", + "90 2023/4 MIT Databricks \n", + "91 2023/2 Non-commercial Meta \n", "\n", " Link \n", "0 https://platform.openai.com/docs/models/gpt-4-... \n", @@ -1061,30 +875,23 @@ "2 https://www.anthropic.com/news/claude-3-family \n", "3 https://blog.google/technology/ai/google-gemin... \n", "4 https://openai.com/blog/new-models-and-develop... \n", - "5 https://bard.google.com/ \n", - "6 https://llama.meta.com/llama3/ \n", - "7 https://www.anthropic.com/news/claude-3-family \n", - "8 https://txt.cohere.com/command-r-plus-microsof... \n", - "9 https://openai.com/research/gpt-4 \n", - "10 https://www.anthropic.com/news/claude-3-family \n", - "11 https://platform.openai.com/docs/models/gpt-4-... \n", - "12 https://mistral.ai/news/mistral-large/ \n", - "13 https://qwenlm.github.io/blog/qwen1.5/ \n", - "14 https://docs.reka.ai/http-api.html#generation \n", - "15 https://www.anthropic.com/index/introducing-cl... \n", - "16 https://www.reka.ai/news/reka-flash-efficient-... \n", - "17 https://txt.cohere.com/command-r \n", - "18 https://mistral.ai/news/la-plateforme/ \n", - "19 https://mistral.ai/news/mixtral-8x22b/ " + ".. ... \n", + "87 https://huggingface.co/THUDM/chatglm-6b \n", + "88 https://huggingface.co/lmsys/fastchat-t5-3b-v1.0 \n", + "89 https://huggingface.co/stabilityai/stablelm-tu... \n", + "90 https://huggingface.co/databricks/dolly-v2-12b \n", + "91 https://arxiv.org/abs/2302.13971 \n", + "\n", + "[92 rows x 14 columns]" ] }, - "execution_count": 101, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "merged_dfs[\"Overall\"][:20]" + "merged_dfs[\"Overall\"]" ] }, { @@ -1096,7 +903,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -1106,213 +913,2653 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "t.to_json(\"release_date_mapping.json\", orient=\"records\", lines=True)" + "release_date_mapping = pd.read_json(\"release_date_mapping.json\", orient=\"records\")" ] }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keyModelRelease Date
0gpt-4-turbo-2024-04-09GPT-4-Turbo-2024-04-092024-04-09
1gpt-4-1106-previewGPT-4-1106-preview2023-11-06
2claude-3-opus-20240229Claude 3 Opus2024-02-29
3gemini-1.5-pro-api-0409-previewGemini 1.5 Pro API-0409-Preview2024-04-09
4gpt-4-0125-previewGPT-4-0125-preview2024-01-25
............
86chatglm-6bChatGLM-6B2023-03-13
87fastchat-t5-3bFastChat-T5-3B2023-04-27
88stablelm-tuned-alpha-7bStableLM-Tuned-Alpha-7B2023-04-19
89dolly-v2-12bDolly-V2-12B2023-04-12
90llama-13bLLaMA-13B2023-02-27
\n", + "

91 rows × 3 columns

\n", + "
" + ], "text/plain": [ - "[{'key': 'gpt-4-turbo-2024-04-09',\n", - " 'Model': 'GPT-4-Turbo-2024-04-09',\n", - " 'Release Date': ''},\n", - " {'key': 'gpt-4-1106-preview',\n", - " 'Model': 'GPT-4-1106-preview',\n", - " 'Release Date': ''},\n", - " {'key': 'claude-3-opus-20240229',\n", - " 'Model': 'Claude 3 Opus',\n", - " 'Release Date': ''},\n", - " {'key': 'gemini-1.5-pro-api-0409-preview',\n", - " 'Model': 'Gemini 1.5 Pro API-0409-Preview',\n", - " 'Release Date': ''},\n", - " {'key': 'gpt-4-0125-preview',\n", - " 'Model': 'GPT-4-0125-preview',\n", - " 'Release Date': ''},\n", - " {'key': 'bard-jan-24-gemini-pro',\n", - " 'Model': 'Bard (Gemini Pro)',\n", - " 'Release Date': ''},\n", - " {'key': 'llama-3-70b-instruct',\n", - " 'Model': 'Llama-3-70b-Instruct',\n", - " 'Release Date': ''},\n", - " {'key': 'claude-3-sonnet-20240229',\n", - " 'Model': 'Claude 3 Sonnet',\n", - " 'Release Date': ''},\n", - " {'key': 'command-r-plus', 'Model': 'Command R+', 'Release Date': ''},\n", - " {'key': 'gpt-4-0314', 'Model': 'GPT-4-0314', 'Release Date': ''},\n", - " {'key': 'claude-3-haiku-20240307',\n", - " 'Model': 'Claude 3 Haiku',\n", - " 'Release Date': ''},\n", - " {'key': 'gpt-4-0613', 'Model': 'GPT-4-0613', 'Release Date': ''},\n", - " {'key': 'mistral-large-2402',\n", - " 'Model': 'Mistral-Large-2402',\n", - " 'Release Date': ''},\n", - " {'key': 'qwen1.5-72b-chat', 'Model': 'Qwen1.5-72B-Chat', 'Release Date': ''},\n", - " {'key': 'reka-flash-21b-20240226-online',\n", - " 'Model': 'Reka-Flash-21B-online',\n", - " 'Release Date': ''},\n", - " {'key': 'claude-1', 'Model': 'Claude-1', 'Release Date': ''},\n", - " {'key': 'reka-flash-21b-20240226',\n", - " 'Model': 'Reka-Flash-21B',\n", - " 'Release Date': ''},\n", - " {'key': 'command-r', 'Model': 'Command R', 'Release Date': ''},\n", - " {'key': 'mistral-medium', 'Model': 'Mistral Medium', 'Release Date': ''},\n", - " {'key': 'mixtral-8x22b-instruct-v0.1',\n", - " 'Model': 'Mixtral-8x22b-Instruct-v0.1',\n", - " 'Release Date': ''},\n", - " {'key': 'llama-3-8b-instruct',\n", - " 'Model': 'Llama-3-8b-Instruct',\n", - " 'Release Date': ''},\n", - " {'key': 'gemini-pro-dev-api',\n", - " 'Model': 'Gemini Pro (Dev API)',\n", - " 'Release Date': ''},\n", - " {'key': 'qwen1.5-32b-chat', 'Model': 'Qwen1.5-32B-Chat', 'Release Date': ''},\n", - " {'key': 'claude-2.0', 'Model': 'Claude-2.0', 'Release Date': ''},\n", - " {'key': 'mistral-next', 'Model': 'Mistral-Next', 'Release Date': ''},\n", - " {'key': 'zephyr-orpo-141b-A35b-v0.1',\n", - " 'Model': 'Zephyr-ORPO-141b-A35b-v0.1',\n", - " 'Release Date': ''},\n", - " {'key': 'gpt-3.5-turbo-0613',\n", - " 'Model': 'GPT-3.5-Turbo-0613',\n", - " 'Release Date': ''},\n", - " {'key': 'claude-2.1', 'Model': 'Claude-2.1', 'Release Date': ''},\n", - " {'key': 'qwen1.5-14b-chat', 'Model': 'Qwen1.5-14B-Chat', 'Release Date': ''},\n", - " {'key': 'starling-lm-7b-beta',\n", - " 'Model': 'Starling-LM-7B-beta',\n", - " 'Release Date': ''},\n", - " {'key': 'gemini-pro', 'Model': 'Gemini Pro', 'Release Date': ''},\n", - " {'key': 'mixtral-8x7b-instruct-v0.1',\n", - " 'Model': 'Mixtral-8x7b-Instruct-v0.1',\n", - " 'Release Date': ''},\n", - " {'key': 'claude-instant-1', 'Model': 'Claude-Instant-1', 'Release Date': ''},\n", - " {'key': 'yi-34b-chat', 'Model': 'Yi-34B-Chat', 'Release Date': ''},\n", - " {'key': 'gpt-3.5-turbo-0314',\n", - " 'Model': 'GPT-3.5-Turbo-0314',\n", - " 'Release Date': ''},\n", - " {'key': 'wizardlm-70b', 'Model': 'WizardLM-70B-v1.0', 'Release Date': ''},\n", - " {'key': 'gpt-3.5-turbo-0125',\n", - " 'Model': 'GPT-3.5-Turbo-0125',\n", - " 'Release Date': ''},\n", - " {'key': 'tulu-2-dpo-70b', 'Model': 'Tulu-2-DPO-70B', 'Release Date': ''},\n", - " {'key': 'dbrx-instruct-preview',\n", - " 'Model': 'DBRX-Instruct-Preview',\n", - " 'Release Date': ''},\n", - " {'key': 'openchat-3.5-0106',\n", - " 'Model': 'OpenChat-3.5-0106',\n", - " 'Release Date': ''},\n", - " {'key': 'vicuna-33b', 'Model': 'Vicuna-33B', 'Release Date': ''},\n", - " {'key': 'starling-lm-7b-alpha',\n", - " 'Model': 'Starling-LM-7B-alpha',\n", - " 'Release Date': ''},\n", - " {'key': 'llama-2-70b-chat', 'Model': 'Llama-2-70b-chat', 'Release Date': ''},\n", - " {'key': 'nous-hermes-2-mixtral-8x7b-dpo',\n", - " 'Model': 'Nous-Hermes-2-Mixtral-8x7B-DPO',\n", - " 'Release Date': ''},\n", - " {'key': 'gemma-1.1-7b-it', 'Model': 'Gemma-1.1-7B-it', 'Release Date': ''},\n", - " {'key': 'llama2-70b-steerlm-chat',\n", - " 'Model': 'NV-Llama2-70B-SteerLM-Chat',\n", - " 'Release Date': ''},\n", - " {'key': 'deepseek-llm-67b-chat',\n", - " 'Model': 'DeepSeek-LLM-67B-Chat',\n", - " 'Release Date': ''},\n", - " {'key': 'openhermes-2.5-mistral-7b',\n", - " 'Model': 'OpenHermes-2.5-Mistral-7b',\n", - " 'Release Date': ''},\n", - " {'key': 'openchat-3.5', 'Model': 'OpenChat-3.5', 'Release Date': ''},\n", - " {'key': 'pplx-70b-online', 'Model': 'pplx-70b-online', 'Release Date': ''},\n", - " {'key': 'mistral-7b-instruct-v0.2',\n", - " 'Model': 'Mistral-7B-Instruct-v0.2',\n", - " 'Release Date': ''},\n", - " {'key': 'qwen1.5-7b-chat', 'Model': 'Qwen1.5-7B-Chat', 'Release Date': ''},\n", - " {'key': 'gpt-3.5-turbo-1106',\n", - " 'Model': 'GPT-3.5-Turbo-1106',\n", - " 'Release Date': ''},\n", - " {'key': 'dolphin-2.2.1-mistral-7b',\n", - " 'Model': 'Dolphin-2.2.1-Mistral-7B',\n", - " 'Release Date': ''},\n", - " {'key': 'solar-10.7b-instruct-v1.0',\n", - " 'Model': 'SOLAR-10.7B-Instruct-v1.0',\n", - " 'Release Date': ''},\n", - " {'key': 'phi-3-mini-128k-instruct',\n", - " 'Model': 'Phi-3-Mini-128k-Instruct',\n", - " 'Release Date': ''},\n", - " {'key': 'wizardlm-13b', 'Model': 'WizardLM-13b-v1.2', 'Release Date': ''},\n", - " {'key': 'llama-2-13b-chat', 'Model': 'Llama-2-13b-chat', 'Release Date': ''},\n", - " {'key': 'zephyr-7b-beta', 'Model': 'Zephyr-7b-beta', 'Release Date': ''},\n", - " {'key': 'codellama-70b-instruct',\n", - " 'Model': 'CodeLlama-70B-instruct',\n", - " 'Release Date': ''},\n", - " {'key': 'mpt-30b-chat', 'Model': 'MPT-30B-chat', 'Release Date': ''},\n", - " {'key': 'vicuna-13b', 'Model': 'Vicuna-13B', 'Release Date': ''},\n", - " {'key': 'codellama-34b-instruct',\n", - " 'Model': 'CodeLlama-34B-instruct',\n", - " 'Release Date': ''},\n", - " {'key': 'gemma-7b-it', 'Model': 'Gemma-7B-it', 'Release Date': ''},\n", - " {'key': 'pplx-7b-online', 'Model': 'pplx-7b-online', 'Release Date': ''},\n", - " {'key': 'zephyr-7b-alpha', 'Model': 'Zephyr-7b-alpha', 'Release Date': ''},\n", - " {'key': 'llama-2-7b-chat', 'Model': 'Llama-2-7b-chat', 'Release Date': ''},\n", - " {'key': 'qwen-14b-chat', 'Model': 'Qwen-14B-Chat', 'Release Date': ''},\n", - " {'key': 'falcon-180b-chat', 'Model': 'falcon-180b-chat', 'Release Date': ''},\n", - " {'key': 'guanaco-33b', 'Model': 'Guanaco-33B', 'Release Date': ''},\n", - " {'key': 'stripedhyena-nous-7b',\n", - " 'Model': 'StripedHyena-Nous-7B',\n", - " 'Release Date': ''},\n", - " {'key': 'olmo-7b-instruct', 'Model': 'OLMo-7B-instruct', 'Release Date': ''},\n", - " {'key': 'gemma-1.1-2b-it', 'Model': 'Gemma-1.1-2B-it', 'Release Date': ''},\n", - " {'key': 'mistral-7b-instruct',\n", - " 'Model': 'Mistral-7B-Instruct-v0.1',\n", - " 'Release Date': ''},\n", - " {'key': 'palm-2', 'Model': 'PaLM-Chat-Bison-001', 'Release Date': ''},\n", - " {'key': 'vicuna-7b', 'Model': 'Vicuna-7B', 'Release Date': ''},\n", - " {'key': 'qwen1.5-4b-chat', 'Model': 'Qwen1.5-4B-Chat', 'Release Date': ''},\n", - " {'key': 'gemma-2b-it', 'Model': 'Gemma-2B-it', 'Release Date': ''},\n", - " {'key': 'koala-13b', 'Model': 'Koala-13B', 'Release Date': ''},\n", - " {'key': 'chatglm3-6b', 'Model': 'ChatGLM3-6B', 'Release Date': ''},\n", - " {'key': 'gpt4all-13b-snoozy',\n", - " 'Model': 'GPT4All-13B-Snoozy',\n", - " 'Release Date': ''},\n", - " {'key': 'chatglm2-6b', 'Model': 'ChatGLM2-6B', 'Release Date': ''},\n", - " {'key': 'mpt-7b-chat', 'Model': 'MPT-7B-Chat', 'Release Date': ''},\n", - " {'key': 'RWKV-4-Raven-14B', 'Model': 'RWKV-4-Raven-14B', 'Release Date': ''},\n", - " {'key': 'alpaca-13b', 'Model': 'Alpaca-13B', 'Release Date': ''},\n", - " {'key': 'oasst-pythia-12b',\n", - " 'Model': 'OpenAssistant-Pythia-12B',\n", - " 'Release Date': ''},\n", - " {'key': 'chatglm-6b', 'Model': 'ChatGLM-6B', 'Release Date': ''},\n", - " {'key': 'fastchat-t5-3b', 'Model': 'FastChat-T5-3B', 'Release Date': ''},\n", - " {'key': 'stablelm-tuned-alpha-7b',\n", - " 'Model': 'StableLM-Tuned-Alpha-7B',\n", - " 'Release Date': ''},\n", - " {'key': 'dolly-v2-12b', 'Model': 'Dolly-V2-12B', 'Release Date': ''},\n", - " {'key': 'llama-13b', 'Model': 'LLaMA-13B', 'Release Date': ''}]" + " key Model \\\n", + "0 gpt-4-turbo-2024-04-09 GPT-4-Turbo-2024-04-09 \n", + "1 gpt-4-1106-preview GPT-4-1106-preview \n", + "2 claude-3-opus-20240229 Claude 3 Opus \n", + "3 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n", + "4 gpt-4-0125-preview GPT-4-0125-preview \n", + ".. ... ... \n", + "86 chatglm-6b ChatGLM-6B \n", + "87 fastchat-t5-3b FastChat-T5-3B \n", + "88 stablelm-tuned-alpha-7b StableLM-Tuned-Alpha-7B \n", + "89 dolly-v2-12b Dolly-V2-12B \n", + "90 llama-13b LLaMA-13B \n", + "\n", + " Release Date \n", + "0 2024-04-09 \n", + "1 2023-11-06 \n", + "2 2024-02-29 \n", + "3 2024-04-09 \n", + "4 2024-01-25 \n", + ".. ... \n", + "86 2023-03-13 \n", + "87 2023-04-27 \n", + "88 2023-04-19 \n", + "89 2023-04-12 \n", + "90 2023-02-27 \n", + "\n", + "[91 rows x 3 columns]" ] }, - "execution_count": 119, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "t.to_dict(orient=\"records\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build plot" + "release_date_mapping" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 15, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keyRelease Date
0gpt-4-turbo-2024-04-092024-04-09
1gpt-4-1106-preview2023-11-06
2claude-3-opus-202402292024-02-29
3gemini-1.5-pro-api-0409-preview2024-04-09
4gpt-4-0125-preview2024-01-25
.........
86chatglm-6b2023-03-13
87fastchat-t5-3b2023-04-27
88stablelm-tuned-alpha-7b2023-04-19
89dolly-v2-12b2023-04-12
90llama-13b2023-02-27
\n", + "

91 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " key Release Date\n", + "0 gpt-4-turbo-2024-04-09 2024-04-09\n", + "1 gpt-4-1106-preview 2023-11-06\n", + "2 claude-3-opus-20240229 2024-02-29\n", + "3 gemini-1.5-pro-api-0409-preview 2024-04-09\n", + "4 gpt-4-0125-preview 2024-01-25\n", + ".. ... ...\n", + "86 chatglm-6b 2023-03-13\n", + "87 fastchat-t5-3b 2023-04-27\n", + "88 stablelm-tuned-alpha-7b 2023-04-19\n", + "89 dolly-v2-12b 2023-04-12\n", + "90 llama-13b 2023-02-27\n", + "\n", + "[91 rows x 2 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "release_date_mapping[[\"key\", \"Release Date\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# add release dates into the merged data\n", + "for k, v in merged_dfs.items():\n", + " merged_dfs[k] = pd.merge(\n", + " merged_dfs[k], release_date_mapping[[\"key\", \"Release Date\"]], on=\"key\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',\n", + " 'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',\n", + " 'Knowledge cutoff date', 'License', 'Organization', 'Link',\n", + " 'Release Date'],\n", + " dtype='object')" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_dfs[\"Overall\"].columns" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "def format_data(df):\n", + " df[\"License\"] = df[\"License\"].apply(\n", + " lambda x: \"Proprietary LLM\" if x in PROPRIETARY_LICENSES else \"Open LLM\"\n", + " )\n", + " df[\"Release Date\"] = pd.to_datetime(df[\"Release Date\"])\n", + " df[\"Month-Year\"] = df[\"Release Date\"].dt.to_period(\"M\")\n", + " df[\"rating\"] = df[\"rating\"].round()\n", + " return df.reset_index(drop=True)\n", + "\n", + "\n", + "merged_dfs2 = {k: format_data(v) for k, v in merged_dfs.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5\n", + "5\n", + "5\n", + "5\n", + "5\n", + "5\n", + "5\n", + "5\n", + "5\n" + ] + } + ], + "source": [ + "for k, df in merged_dfs2.items():\n", + " print(\n", + " int(\n", + " df.groupby([\"Release Date\", \"License\"])[\"rating\"]\n", + " .apply(lambda x: len(x))\n", + " .max()\n", + " )\n", + " )\n", + " (df[\"rating\"].min().round(),)\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build plot" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "t = {\n", + " \"Overall\": {\n", + " \"min_elo_score\": 804.0,\n", + " \"max_elo_score\": 1259.0,\n", + " \"upper_models_per_month\": 5,\n", + " },\n", + " \"Coding\": {\n", + " \"min_elo_score\": 672.0,\n", + " \"max_elo_score\": 1270.0,\n", + " \"upper_models_per_month\": 5,\n", + " },\n", + " \"Longer Query\": {\n", + " \"min_elo_score\": 796.0,\n", + " \"max_elo_score\": 1273.0,\n", + " \"upper_models_per_month\": 5,\n", + " },\n", + " \"English\": {\n", + " \"min_elo_score\": 783.0,\n", + " \"max_elo_score\": 1246.0,\n", + " \"upper_models_per_month\": 5,\n", + " },\n", + " \"Chinese\": {\n", + " \"min_elo_score\": 753.0,\n", + " \"max_elo_score\": 1325.0,\n", + " \"upper_models_per_month\": 5,\n", + " },\n", + " \"French\": {\n", + " \"min_elo_score\": 694.0,\n", + " \"max_elo_score\": 1268.0,\n", + " \"upper_models_per_month\": 5,\n", + " },\n", + " \"Exclude Ties\": {\n", + " \"min_elo_score\": 654.0,\n", + " \"max_elo_score\": 1334.0,\n", + " \"upper_models_per_month\": 5,\n", + " },\n", + " \"Exclude Short Query (< 5 tokens)\": {\n", + " \"min_elo_score\": 796.0,\n", + " \"max_elo_score\": 1264.0,\n", + " \"upper_models_per_month\": 5,\n", + " },\n", + " \"Exclude Refusal\": {\n", + " \"min_elo_score\": 795.0,\n", + " \"max_elo_score\": 1264.0,\n", + " \"upper_models_per_month\": 5,\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "o = {\n", + " \"min_elo_score\": ,\n", + " \"max_elo_score\": ,\n", + " \"upper_models_per_month\": ,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "PROPRIETARY_LICENSES = [\n", + " \"Proprietary\",\n", + " \"Non-commercial\",\n", + "]\n", + "\n", + "df = merged_dfs[\"Overall\"]\n", + "df[\"License\"] = df[\"License\"].apply(\n", + " lambda x: \"Proprietary LLM\" if x in PROPRIETARY_LICENSES else \"Open LLM\"\n", + ")\n", + "df[\"Release Date\"] = pd.to_datetime(df[\"Release Date\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"Month-Year\"] = df[\"Release Date\"].dt.to_period(\"M\")" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby([\"Month-Year\", \"License\"])[\"rating\"].apply(lambda x: x.count()).max()" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ratingvariancerating_q975rating_q025num_battlesfinal_rankingkeyModelMT-bench (score)MMLUKnowledge cutoff dateLicenseOrganizationLinkRelease Datelicense_binaryMonth-Year
41246.7775911.9424771249.9797121244.305362673542gpt-4-0125-previewGPT-4-0125-preview--2023/12Proprietary LLMOpenAIhttps://openai.com/blog/new-models-and-develop...2024-01-25Proprietary LLM2024-01
321111.1326407.8017411115.3569931105.6582541317729yi-34b-chatYi-34B-Chat-0.7352023/6Open LLM01 AIhttps://huggingface.co/01-ai/Yi-34B-Chat2024-01-23Open LLM2024-01
361107.1298102.4191821110.0561881104.0025814722032gpt-3.5-turbo-0125GPT-3.5-Turbo-0125--2021/9Proprietary LLMOpenAIhttps://platform.openai.com/docs/models/gpt-3-...2024-01-25Proprietary LLM2024-01
391098.5274556.4001661103.3435921093.9036951415936openchat-3.5-0106OpenChat-3.5-01067.80.6582024/1Open LLMOpenChathttps://huggingface.co/openchat/openchat-3.5-01062024-01-06Open LLM2024-01
431087.30775818.3142581094.5325981078.413814398040nous-hermes-2-mixtral-8x7b-dpoNous-Hermes-2-Mixtral-8x7B-DPO--2024/1Open LLMNousResearchhttps://huggingface.co/NousResearch/Nous-Herme...2024-01-13Open LLM2024-01
601047.92768860.7072251061.9521161034.283514132155codellama-70b-instructCodeLlama-70B-instruct--2024/1Open LLMMetahttps://huggingface.co/codellama/CodeLlama-70b-hf2024-01-29Open LLM2024-01
\n", + "
" + ], + "text/plain": [ + " rating variance rating_q975 rating_q025 num_battles \\\n", + "4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n", + "32 1111.132640 7.801741 1115.356993 1105.658254 13177 \n", + "36 1107.129810 2.419182 1110.056188 1104.002581 47220 \n", + "39 1098.527455 6.400166 1103.343592 1093.903695 14159 \n", + "43 1087.307758 18.314258 1094.532598 1078.413814 3980 \n", + "60 1047.927688 60.707225 1061.952116 1034.283514 1321 \n", + "\n", + " final_ranking key \\\n", + "4 2 gpt-4-0125-preview \n", + "32 29 yi-34b-chat \n", + "36 32 gpt-3.5-turbo-0125 \n", + "39 36 openchat-3.5-0106 \n", + "43 40 nous-hermes-2-mixtral-8x7b-dpo \n", + "60 55 codellama-70b-instruct \n", + "\n", + " Model MT-bench (score) MMLU \\\n", + "4 GPT-4-0125-preview - - \n", + "32 Yi-34B-Chat - 0.735 \n", + "36 GPT-3.5-Turbo-0125 - - \n", + "39 OpenChat-3.5-0106 7.8 0.658 \n", + "43 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n", + "60 CodeLlama-70B-instruct - - \n", + "\n", + " Knowledge cutoff date License Organization \\\n", + "4 2023/12 Proprietary LLM OpenAI \n", + "32 2023/6 Open LLM 01 AI \n", + "36 2021/9 Proprietary LLM OpenAI \n", + "39 2024/1 Open LLM OpenChat \n", + "43 2024/1 Open LLM NousResearch \n", + "60 2024/1 Open LLM Meta \n", + "\n", + " Link Release Date \\\n", + "4 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n", + "32 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n", + "36 https://platform.openai.com/docs/models/gpt-3-... 2024-01-25 \n", + "39 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n", + "43 https://huggingface.co/NousResearch/Nous-Herme... 2024-01-13 \n", + "60 https://huggingface.co/codellama/CodeLlama-70b-hf 2024-01-29 \n", + "\n", + " license_binary Month-Year \n", + "4 Proprietary LLM 2024-01 \n", + "32 Open LLM 2024-01 \n", + "36 Proprietary LLM 2024-01 \n", + "39 Open LLM 2024-01 \n", + "43 Open LLM 2024-01 \n", + "60 Open LLM 2024-01 " + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"Month-Year\"] == \"2024-01\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/w0/6t9rxkj97rv47l9sc0q22yth0000gn/T/ipykernel_7726/1725500526.py:1: DeprecationWarning:\n", + "\n", + "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ratingvariancerating_q975rating_q025num_battlesfinal_rankingkeyModelMT-bench (score)MMLUKnowledge cutoff dateLicenseOrganizationLinkRelease Datelicense_binaryMonth-Year
01111.1326407.8017411115.3569931105.6582541317729yi-34b-chatYi-34B-Chat-0.7352023/6Open LLM01 AIhttps://huggingface.co/01-ai/Yi-34B-Chat2024-01-23Open LLM2024-01
11098.5274556.4001661103.3435921093.9036951415936openchat-3.5-0106OpenChat-3.5-01067.80.6582024/1Open LLMOpenChathttps://huggingface.co/openchat/openchat-3.5-01062024-01-06Open LLM2024-01
21087.30775818.3142581094.5325981078.413814398040nous-hermes-2-mixtral-8x7b-dpoNous-Hermes-2-Mixtral-8x7B-DPO--2024/1Open LLMNousResearchhttps://huggingface.co/NousResearch/Nous-Herme...2024-01-13Open LLM2024-01
31246.7775911.9424771249.9797121244.305362673542gpt-4-0125-previewGPT-4-0125-preview--2023/12Proprietary LLMOpenAIhttps://openai.com/blog/new-models-and-develop...2024-01-25Proprietary LLM2024-01
41107.1298102.4191821110.0561881104.0025814722032gpt-3.5-turbo-0125GPT-3.5-Turbo-0125--2021/9Proprietary LLMOpenAIhttps://platform.openai.com/docs/models/gpt-3-...2024-01-25Proprietary LLM2024-01
\n", + "
" + ], + "text/plain": [ + " rating variance rating_q975 rating_q025 num_battles \\\n", + "0 1111.132640 7.801741 1115.356993 1105.658254 13177 \n", + "1 1098.527455 6.400166 1103.343592 1093.903695 14159 \n", + "2 1087.307758 18.314258 1094.532598 1078.413814 3980 \n", + "3 1246.777591 1.942477 1249.979712 1244.305362 67354 \n", + "4 1107.129810 2.419182 1110.056188 1104.002581 47220 \n", + "\n", + " final_ranking key \\\n", + "0 29 yi-34b-chat \n", + "1 36 openchat-3.5-0106 \n", + "2 40 nous-hermes-2-mixtral-8x7b-dpo \n", + "3 2 gpt-4-0125-preview \n", + "4 32 gpt-3.5-turbo-0125 \n", + "\n", + " Model MT-bench (score) MMLU \\\n", + "0 Yi-34B-Chat - 0.735 \n", + "1 OpenChat-3.5-0106 7.8 0.658 \n", + "2 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n", + "3 GPT-4-0125-preview - - \n", + "4 GPT-3.5-Turbo-0125 - - \n", + "\n", + " Knowledge cutoff date License Organization \\\n", + "0 2023/6 Open LLM 01 AI \n", + "1 2024/1 Open LLM OpenChat \n", + "2 2024/1 Open LLM NousResearch \n", + "3 2023/12 Proprietary LLM OpenAI \n", + "4 2021/9 Proprietary LLM OpenAI \n", + "\n", + " Link Release Date \\\n", + "0 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n", + "1 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n", + "2 https://huggingface.co/NousResearch/Nous-Herme... 2024-01-13 \n", + "3 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n", + "4 https://platform.openai.com/docs/models/gpt-3-... 2024-01-25 \n", + "\n", + " license_binary Month-Year \n", + "0 Open LLM 2024-01 \n", + "1 Open LLM 2024-01 \n", + "2 Open LLM 2024-01 \n", + "3 Proprietary LLM 2024-01 \n", + "4 Proprietary LLM 2024-01 " + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"Month-Year\"] == \"2024-01\"].groupby([\"Month-Year\", \"License\"]).apply(\n", + " lambda x: x.nlargest(3, \"rating\")\n", + ").reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',\n", + " 'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',\n", + " 'Knowledge cutoff date', 'License', 'Organization', 'Link',\n", + " 'Release Date', 'license_binary'],\n", + " dtype='object')" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "customdata": [ + [ + "OpenAI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "OpenAI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Anthropic", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Google", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "OpenAI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Google", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Anthropic", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "OpenAI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Anthropic", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "OpenAI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Mistral", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Reka AI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Anthropic", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Mistral", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Reka AI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Google", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Anthropic", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Mistral", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "OpenAI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Anthropic", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Google", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Anthropic", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "OpenAI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "OpenAI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "LMSYS", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Perplexity AI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "OpenAI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Perplexity AI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "UW", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Google", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "UC Berkeley", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Nomic AI", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Stanford", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Tsinghua", + "Proprietary LLM", + "Proprietary LLM" + ], + [ + "Meta", + "Proprietary LLM", + "Proprietary LLM" + ] + ], + "hovertemplate": "%{hovertext}

license_binary=%{customdata[2]}
Release Date=%{x}
Arena ELO=%{y}
Organization=%{customdata[0]}
License=%{customdata[1]}", + "hovertext": [ + "GPT-4-Turbo-2024-04-09", + "GPT-4-1106-preview", + "Claude 3 Opus", + "Gemini 1.5 Pro API-0409-Preview", + "GPT-4-0125-preview", + "Bard (Gemini Pro)", + "Claude 3 Sonnet", + "GPT-4-0314", + "Claude 3 Haiku", + "GPT-4-0613", + "Mistral-Large-2402", + "Reka-Flash-21B-online", + "Claude-1", + "Mistral Medium", + "Reka-Flash-21B", + "Gemini Pro (Dev API)", + "Claude-2.0", + "Mistral-Next", + "GPT-3.5-Turbo-0613", + "Claude-2.1", + "Gemini Pro", + "Claude-Instant-1", + "GPT-3.5-Turbo-0314", + "GPT-3.5-Turbo-0125", + "Vicuna-33B", + "pplx-70b-online", + "GPT-3.5-Turbo-1106", + "pplx-7b-online", + "Guanaco-33B", + "PaLM-Chat-Bison-001", + "Koala-13B", + "GPT4All-13B-Snoozy", + "Alpaca-13B", + "ChatGLM-6B", + "LLaMA-13B" + ], + "legendgroup": "Proprietary LLM", + "marker": { + "color": "#636efa", + "size": 8, + "symbol": "circle" + }, + "mode": "markers", + "name": "Proprietary LLM", + "orientation": "v", + "showlegend": true, + "type": "scatter", + "x": [ + "2024-04-09T00:00:00", + "2023-11-06T00:00:00", + "2024-02-29T00:00:00", + "2024-04-09T00:00:00", + "2024-01-25T00:00:00", + "2024-02-01T00:00:00", + "2024-02-29T00:00:00", + "2024-03-14T00:00:00", + "2024-03-07T00:00:00", + "2023-06-13T00:00:00", + "2024-02-24T00:00:00", + "2024-02-26T00:00:00", + "2023-03-14T00:00:00", + "2023-12-11T00:00:00", + "2024-02-26T00:00:00", + "2023-12-13T00:00:00", + "2023-07-11T00:00:00", + "2024-02-17T00:00:00", + "2023-06-13T00:00:00", + "2023-11-21T00:00:00", + "2023-12-13T00:00:00", + "2023-03-14T00:00:00", + "2024-03-14T00:00:00", + "2024-01-25T00:00:00", + "2023-06-21T00:00:00", + "2023-11-29T00:00:00", + "2023-11-06T00:00:00", + "2023-11-29T00:00:00", + "2023-05-22T00:00:00", + "2023-07-10T00:00:00", + "2023-04-03T00:00:00", + "2023-04-24T00:00:00", + "2023-03-13T00:00:00", + "2023-03-13T00:00:00", + "2023-02-27T00:00:00" + ], + "xaxis": "x", + "y": [ + 1258.8152791324715, + 1252.6848856241577, + 1250.9262064295565, + 1249.6183945401244, + 1246.7775913509702, + 1208.7128773784577, + 1201.2654981955752, + 1189.557977031121, + 1180.8870022256567, + 1165.279013874706, + 1157.2129636222178, + 1153.368015144387, + 1150.6246111849628, + 1148.003325470259, + 1147.136619289767, + 1135.7254379948201, + 1132.3083987521873, + 1126.6887059695398, + 1119.8996424050451, + 1119.0708879096221, + 1115.3213731540973, + 1110.3806845414053, + 1108.9125926100855, + 1107.1298100300314, + 1093.8870113925889, + 1075.4285458870645, + 1072.711340370162, + 1043.3909111518306, + 1034.3952377983876, + 1009.7116452193085, + 969.48148016344, + 938.8924300511185, + 908.0843590844727, + 886.8734292498528, + 804.3563285706291 + ], + "yaxis": "y" + }, + { + "customdata": [ + [ + "Meta", + "Open LLM", + "Open LLM" + ], + [ + "Cohere", + "Open LLM", + "Open LLM" + ], + [ + "Meta", + "Open LLM", + "Open LLM" + ], + [ + "Alibaba", + "Open LLM", + "Open LLM" + ], + [ + "Cohere", + "Open LLM", + "Open LLM" + ], + [ + "Mistral", + "Open LLM", + "Open LLM" + ], + [ + "Alibaba", + "Open LLM", + "Open LLM" + ], + [ + "HuggingFace", + "Open LLM", + "Open LLM" + ], + [ + "Nexusflow", + "Open LLM", + "Open LLM" + ], + [ + "Alibaba", + "Open LLM", + "Open LLM" + ], + [ + "Mistral", + "Open LLM", + "Open LLM" + ], + [ + "01 AI", + "Open LLM", + "Open LLM" + ], + [ + "Microsoft", + "Open LLM", + "Open LLM" + ], + [ + "Databricks", + "Open LLM", + "Open LLM" + ], + [ + "AllenAI/UW", + "Open LLM", + "Open LLM" + ], + [ + "OpenChat", + "Open LLM", + "Open LLM" + ], + [ + "UC Berkeley", + "Open LLM", + "Open LLM" + ], + [ + "Meta", + "Open LLM", + "Open LLM" + ], + [ + "NousResearch", + "Open LLM", + "Open LLM" + ], + [ + "Google", + "Open LLM", + "Open LLM" + ], + [ + "Nvidia", + "Open LLM", + "Open LLM" + ], + [ + "DeepSeek AI", + "Open LLM", + "Open LLM" + ], + [ + "OpenChat", + "Open LLM", + "Open LLM" + ], + [ + "NousResearch", + "Open LLM", + "Open LLM" + ], + [ + "Alibaba", + "Open LLM", + "Open LLM" + ], + [ + "Mistral", + "Open LLM", + "Open LLM" + ], + [ + "Cognitive Computations", + "Open LLM", + "Open LLM" + ], + [ + "Upstage AI", + "Open LLM", + "Open LLM" + ], + [ + "Microsoft", + "Open LLM", + "Open LLM" + ], + [ + "Meta", + "Open LLM", + "Open LLM" + ], + [ + "HuggingFace", + "Open LLM", + "Open LLM" + ], + [ + "Microsoft", + "Open LLM", + "Open LLM" + ], + [ + "LMSYS", + "Open LLM", + "Open LLM" + ], + [ + "Meta", + "Open LLM", + "Open LLM" + ], + [ + "MosaicML", + "Open LLM", + "Open LLM" + ], + [ + "Meta", + "Open LLM", + "Open LLM" + ], + [ + "Google", + "Open LLM", + "Open LLM" + ], + [ + "HuggingFace", + "Open LLM", + "Open LLM" + ], + [ + "Meta", + "Open LLM", + "Open LLM" + ], + [ + "Alibaba", + "Open LLM", + "Open LLM" + ], + [ + "TII", + "Open LLM", + "Open LLM" + ], + [ + "Together AI", + "Open LLM", + "Open LLM" + ], + [ + "Allen AI", + "Open LLM", + "Open LLM" + ], + [ + "Google", + "Open LLM", + "Open LLM" + ], + [ + "Mistral", + "Open LLM", + "Open LLM" + ], + [ + "LMSYS", + "Open LLM", + "Open LLM" + ], + [ + "Alibaba", + "Open LLM", + "Open LLM" + ], + [ + "Google", + "Open LLM", + "Open LLM" + ], + [ + "Tsinghua", + "Open LLM", + "Open LLM" + ], + [ + "MosaicML", + "Open LLM", + "Open LLM" + ], + [ + "Tsinghua", + "Open LLM", + "Open LLM" + ], + [ + "RWKV", + "Open LLM", + "Open LLM" + ], + [ + "OpenAssistant", + "Open LLM", + "Open LLM" + ], + [ + "LMSYS", + "Open LLM", + "Open LLM" + ], + [ + "Stability AI", + "Open LLM", + "Open LLM" + ], + [ + "Databricks", + "Open LLM", + "Open LLM" + ] + ], + "hovertemplate": "%{hovertext}

license_binary=%{customdata[2]}
Release Date=%{x}
Arena ELO=%{y}
Organization=%{customdata[0]}
License=%{customdata[1]}", + "hovertext": [ + "Llama-3-70b-Instruct", + "Command R+", + "Llama-3-8b-Instruct", + "Qwen1.5-72B-Chat", + "Command R", + "Mixtral-8x22b-Instruct-v0.1", + "Qwen1.5-32B-Chat", + "Zephyr-ORPO-141b-A35b-v0.1", + "Starling-LM-7B-beta", + "Qwen1.5-14B-Chat", + "Mixtral-8x7b-Instruct-v0.1", + "Yi-34B-Chat", + "WizardLM-70B-v1.0", + "DBRX-Instruct-Preview", + "Tulu-2-DPO-70B", + "OpenChat-3.5-0106", + "Starling-LM-7B-alpha", + "Llama-2-70b-chat", + "Nous-Hermes-2-Mixtral-8x7B-DPO", + "Gemma-1.1-7B-it", + "NV-Llama2-70B-SteerLM-Chat", + "DeepSeek-LLM-67B-Chat", + "OpenChat-3.5", + "OpenHermes-2.5-Mistral-7b", + "Qwen1.5-7B-Chat", + "Mistral-7B-Instruct-v0.2", + "Dolphin-2.2.1-Mistral-7B", + "SOLAR-10.7B-Instruct-v1.0", + "WizardLM-13b-v1.2", + "Llama-2-13b-chat", + "Zephyr-7b-beta", + "Phi-3-Mini-128k-Instruct", + "Vicuna-13B", + "CodeLlama-70B-instruct", + "MPT-30B-chat", + "CodeLlama-34B-instruct", + "Gemma-7B-it", + "Zephyr-7b-alpha", + "Llama-2-7b-chat", + "Qwen-14B-Chat", + "falcon-180b-chat", + "StripedHyena-Nous-7B", + "OLMo-7B-instruct", + "Gemma-1.1-2B-it", + "Mistral-7B-Instruct-v0.1", + "Vicuna-7B", + "Qwen1.5-4B-Chat", + "Gemma-2B-it", + "ChatGLM3-6B", + "MPT-7B-Chat", + "ChatGLM2-6B", + "RWKV-4-Raven-14B", + "OpenAssistant-Pythia-12B", + "FastChat-T5-3B", + "StableLM-Tuned-Alpha-7B", + "Dolly-V2-12B" + ], + "legendgroup": "Open LLM", + "marker": { + "color": "#EF553B", + "size": 8, + "symbol": "circle" + }, + "mode": "markers", + "name": "Open LLM", + "orientation": "v", + "showlegend": true, + "type": "scatter", + "x": [ + "2024-04-18T00:00:00", + "2024-04-04T00:00:00", + "2024-04-18T00:00:00", + "2024-02-04T00:00:00", + "2024-03-11T00:00:00", + "2024-04-17T00:00:00", + "2024-02-04T00:00:00", + "2024-04-12T00:00:00", + "2024-03-20T00:00:00", + "2024-02-04T00:00:00", + "2023-12-11T00:00:00", + "2024-01-23T00:00:00", + "2023-08-09T00:00:00", + "2024-03-27T00:00:00", + "2023-11-12T00:00:00", + "2024-01-06T00:00:00", + "2023-11-25T00:00:00", + "2023-07-18T00:00:00", + "2024-01-13T00:00:00", + "2024-04-09T00:00:00", + "2023-11-24T00:00:00", + "2023-11-29T00:00:00", + "2023-11-16T00:00:00", + "2023-10-29T00:00:00", + "2024-02-04T00:00:00", + "2023-12-11T00:00:00", + "2023-10-30T00:00:00", + "2023-12-13T00:00:00", + "2023-07-25T00:00:00", + "2023-07-18T00:00:00", + "2023-10-26T00:00:00", + "2024-04-23T00:00:00", + "2023-07-23T00:00:00", + "2024-01-29T00:00:00", + "2023-06-09T00:00:00", + "2023-08-24T00:00:00", + "2024-02-21T00:00:00", + "2023-10-09T00:00:00", + "2023-07-18T00:00:00", + "2023-09-24T00:00:00", + "2023-09-05T00:00:00", + "2023-12-07T00:00:00", + "2024-02-23T00:00:00", + "2024-04-09T00:00:00", + "2023-09-27T00:00:00", + "2023-07-29T00:00:00", + "2024-02-04T00:00:00", + "2024-02-21T00:00:00", + "2023-10-25T00:00:00", + "2023-05-04T00:00:00", + "2023-06-25T00:00:00", + "2023-05-22T00:00:00", + "2023-04-03T00:00:00", + "2023-04-27T00:00:00", + "2023-04-19T00:00:00", + "2023-04-12T00:00:00" + ], + "xaxis": "x", + "y": [ + 1209.6462958943152, + 1190.5291640364956, + 1152.500938092916, + 1152.485612667822, + 1147.8966494489798, + 1145.8123271934626, + 1133.8011394014864, + 1128.8163366984966, + 1118.5178781177128, + 1118.475700517794, + 1114, + 1111.1326399460543, + 1108.552744333791, + 1103.2167069462541, + 1102.79428840509, + 1098.527455141752, + 1091.5210240331344, + 1088.7078065720734, + 1087.307757938674, + 1082.9619916739105, + 1082.4713591517852, + 1079.7362777221456, + 1078.6663284631356, + 1078.6429577216027, + 1076.5321247427814, + 1074.0655548845186, + 1065.574858796917, + 1065.0611191304033, + 1061.9003873957429, + 1056.9265912995625, + 1054.4162995844372, + 1050.1481252382014, + 1047.9555279582555, + 1047.927687897156, + 1047.823066613369, + 1047.396876459045, + 1043.5443043467913, + 1043.0842673002462, + 1040.7537596503887, + 1038.586932982431, + 1037.076380506833, + 1023.112092466059, + 1020.7569311460566, + 1014.832737666584, + 1012.1048679697501, + 1009.3834445358582, + 1002.744713564041, + 999.6431193544297, + 960.7895509564338, + 933.340871331175, + 933.3372880828122, + 928.4512512366093, + 900.2948677134343, + 876.9291083582452, + 848.9325675003323, + 826.6473317994165 + ], + "yaxis": "y" + } + ], + "layout": { + "legend": { + "title": { + "text": "license_binary" + }, + "tracegroupgap": 0 + }, + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "white", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "white", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "#C8D4E3", + "linecolor": "#C8D4E3", + "minorgridcolor": "#C8D4E3", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "#C8D4E3", + "linecolor": "#C8D4E3", + "minorgridcolor": "#C8D4E3", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "white", + "showlakes": true, + "showland": true, + "subunitcolor": "#C8D4E3" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "white", + "polar": { + "angularaxis": { + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "" + }, + "bgcolor": "white", + "radialaxis": { + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "white", + "gridcolor": "#DFE8F3", + "gridwidth": 2, + "linecolor": "#EBF0F8", + "showbackground": true, + "ticks": "", + "zerolinecolor": "#EBF0F8" + }, + "yaxis": { + "backgroundcolor": "white", + "gridcolor": "#DFE8F3", + "gridwidth": 2, + "linecolor": "#EBF0F8", + "showbackground": true, + "ticks": "", + "zerolinecolor": "#EBF0F8" + }, + "zaxis": { + "backgroundcolor": "white", + "gridcolor": "#DFE8F3", + "gridwidth": 2, + "linecolor": "#EBF0F8", + "showbackground": true, + "ticks": "", + "zerolinecolor": "#EBF0F8" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "#DFE8F3", + "linecolor": "#A2B1C6", + "ticks": "" + }, + "baxis": { + "gridcolor": "#DFE8F3", + "linecolor": "#A2B1C6", + "ticks": "" + }, + "bgcolor": "white", + "caxis": { + "gridcolor": "#DFE8F3", + "linecolor": "#A2B1C6", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "#EBF0F8", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "#EBF0F8", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)" + }, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 1 + ], + "title": { + "text": "Release Date" + } + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 1 + ], + "title": { + "text": "Arena ELO" + } + } + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import plotly.express as px\n", + "import plotly.graph_objects as go\n", + "\n", + "# Plotting\n", + "fig = px.scatter(\n", + " df,\n", + " x=\"Release Date\",\n", + " y=\"rating\",\n", + " color=\"license_binary\",\n", + " hover_name=\"Model\",\n", + " hover_data=[\n", + " \"Release Date\",\n", + " \"Organization\",\n", + " \"License\",\n", + " \"license_binary\",\n", + " ],\n", + " title=\"Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)\",\n", + " labels={\"rating\": \"Arena ELO\", \"Release Date\": \"Release Date\"},\n", + " template=\"plotly_white\",\n", + ")\n", + "fig.update_traces(marker=dict(size=8))\n", + "\n", + "# Display the plot\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "plotly.graph_objs._figure.Figure" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [] } ],