Judge_Model,full_name,Realization,kt_with_elo,beta_fit,bias_std,ci_low,ci_high
URM-LLaMa-3.1-8B,URM-LLaMa-3.1-8B_BT,Reward,0.8188194038573934,1.8368566117313365,0.08485779417852375,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#bad-good_textual-score_BT,Likert,0.8173049165314519,4.755366194422462,0.07924632786346093,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#0-100_verbalized-score_BT,Numeric,0.8141437755698421,4.0878126854835,0.07925204683413378,,
mistral-large-instruct-2407,mistral-large-instruct-2407#bad-good_textual-score_BT,Likert,0.8106370543541787,5.471086170786369,0.08571761438483068,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#comparative-anchor-gpt-4-0314_BT,Anchor,0.8094681472822909,3.0737000941858494,0.08468363327726611,,
mistral-large-instruct-2407,mistral-large-instruct-2407#0-100_verbalized-score_BT,Numeric,0.8094681472822909,3.0100161147388955,0.0820935147925871,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#0-100_verbalized-score_BT,Numeric,0.8047925189947399,4.330580224795526,0.08713575870563035,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#0-100_verbalized-score_BT,Numeric,0.8036236119228521,2.911340336840001,0.07690456122004496,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#bad-good_textual-score_BT,Likert,0.7977790765634132,4.610807214374205,0.08715933303084132,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#0-100_verbalized-score_BT,Numeric,0.7977790765634132,2.6939668808867303,0.08683395704068392,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.7942723553477498,2.9295541683113755,0.08961997841795598,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#bad-good_textual-score_BT,Likert,0.7872589129164231,5.218423477156838,0.09726366816471475,,
Skywork-Reward-Llama-3.1-8B-v0.2,Skywork-Reward-Llama-3.1-8B-v0.2_BT,Reward,0.7779076563413208,2.461196439206365,0.09968448349021375,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.776738749269433,2.689252148396911,0.08165561425906073,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.7755698421975452,2.123702380621778,0.08878149867714963,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#0-100_verbalized-score_BT,Numeric,0.7744009351256574,2.147368211099292,0.07704891970422703,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#bad-good_textual-score_BT,Likert,0.7728734618416162,5.485635896159162,0.08906791777991026,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.7650496785505552,1.2599940887461256,0.06974800184734668,,
Llama-3-OffsetBias-RM-8B,Llama-3-OffsetBias-RM-8B_BT,Reward,0.7650496785505552,1.386859930640412,0.07566984800414184,,
ArmoRM-Llama3-8B-v0.1,ArmoRM-Llama3-8B-v0.1_BT,Reward,0.7627118644067796,1.8398700318743302,0.09237283513647337,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#good-yes-no_logprob-score_BT,TokenProbs,0.7521917007597895,2.10259493695348,0.08401740959016844,,
llama-3-70b-instruct,llama-3-70b-instruct#0-100_verbalized-score_BT,Numeric,0.7486849795441262,1.2738290052706196,0.0843328604031163,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.7405026300409117,0.5983808410581415,0.06128229980875954,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.7381648158971361,2.534301904660848,0.10758560218545998,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#comparative-anchor-gpt-4-0314_BT,Anchor,0.7299824663939216,3.577096074866618,0.11235985485344185,,
mistral-large-instruct-2407,mistral-large-instruct-2407#comparative-anchor-gpt-4-0314_BT,Anchor,0.7253068381063704,2.1297623705935567,0.11110146913759139,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#bad-good_textual-score_BT,Likert,0.7229690239625949,0.9348572206360583,0.09020347195052465,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#bad-good_textual-score_BT,Likert,0.7218001168907071,3.901943147622847,0.12009332653938945,,
internlm2-20b-reward,internlm2-20b-reward_BT,Reward,0.7171244886031559,1.9003691605695128,0.09838917792721068,,
internlm2-7b-reward,internlm2-7b-reward_BT,Reward,0.7124488603156048,2.353664499796978,0.11336430399297745,,
GRM-Llama3.2-3B-rewardmodel-ft,GRM-Llama3.2-3B-rewardmodel-ft_BT,Reward,0.7112799532437171,2.302320479143431,0.11380131384129795,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.7019286966686147,1.849571280249153,0.08844553785560817,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#good-yes-no_logprob-score_BT,TokenProbs,0.7003800357687263,2.2241585234952765,0.09319651198050675,,
llama-3-70b-instruct,llama-3-70b-instruct#bad-good_textual-score_BT,Likert,0.6984219754529515,2.4001241250045453,0.12200247480571864,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.6879018118059613,2.711477731374198,0.1262116303875313,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.6773816481589713,0.8684080176854654,0.08510985248268717,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#good-yes-no_logprob-score_BT,TokenProbs,0.6715371127995324,1.5497071579540496,0.09227087986533976,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#0-100_verbalized-score_BT,Numeric,0.668030391583869,1.2045215166190555,0.10431279729383011,,
llama-3-70b-instruct,llama-3-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.6633547632963179,0.774852442203426,0.0712068298228739,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#comparative-anchor-gpt-4-0314_BT,Anchor,0.6586791350087667,1.4123201435751862,0.1108225332258117,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.6563413208649912,1.2703499058409227,0.10233107571803857,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.6551724137931034,1.1679716230736195,0.10168601661101805,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.64114552893045,1.4971790245204495,0.1398327898420888,,
llama-3-70b-instruct,llama-3-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.6329631794272355,1.8202412271839277,0.13196822136679537,,
Eurus-RM-7b,Eurus-RM-7b_BT,Reward,0.6282875511396844,2.492726583183693,0.13811267469299746,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.5897136177673874,0.8381223432288695,0.11017386542259801,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.4272355347749853,0.7389819834751149,0.1070750522871957,,
mistral-large-instruct-2407,mistral-large-instruct-2407#good-yes-no_logprob-score_BT,TokenProbs,0.3687901811805961,1.1652168815452648,0.12258728493955592,,