alielfilali01 commited on
Commit
fbf86f4
·
verified ·
1 Parent(s): 2d6b85b

Update results.json with latest aggregated results.

Browse files
Files changed (1) hide show
  1. assets/results/results.json +391 -1
assets/results/results.json CHANGED
@@ -1350,6 +1350,396 @@
1350
  }
1351
  },
1352
  {
1353
- "_last_sync_timestamp": "2024-12-15T21:20:51.136159"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1354
  }
1355
  ]
 
1350
  }
1351
  },
1352
  {
1353
+ "claude-3.5-sonnet Scores": {
1354
+ "3C3H Scores": {
1355
+ "Correctness": 0.4791,
1356
+ "Completeness": 0.4433,
1357
+ "Conciseness": 0.2109,
1358
+ "Helpfulness": 0.434,
1359
+ "Honesty": 0.466,
1360
+ "Harmlessness": 0.4773,
1361
+ "3C3H Score": 0.4184
1362
+ },
1363
+ "Tasks Scores": {
1364
+ "Question Answering (QA)": 0.4969,
1365
+ "Reasoning": 0.4778,
1366
+ "Orthographic and Grammatical Analysis": 0.0,
1367
+ "Safety": 0.2437
1368
+ }
1369
+ },
1370
+ "Meta": {
1371
+ "Model Name": "CohereForAI/aya-23-8B",
1372
+ "License": "cc-by-nc-4.0",
1373
+ "Revision": "main",
1374
+ "Precision": "float16",
1375
+ "Params": 8.0,
1376
+ "Total Entries": 279,
1377
+ "Successful Entries": 279,
1378
+ "Failed Entries": 0,
1379
+ "Success Ratio": 1.0
1380
+ }
1381
+ },
1382
+ {
1383
+ "claude-3.5-sonnet Scores": {
1384
+ "3C3H Scores": {
1385
+ "Correctness": 0.4636,
1386
+ "Completeness": 0.4409,
1387
+ "Conciseness": 0.1532,
1388
+ "Helpfulness": 0.4062,
1389
+ "Honesty": 0.4379,
1390
+ "Harmlessness": 0.4636,
1391
+ "3C3H Score": 0.3942
1392
+ },
1393
+ "Tasks Scores": {
1394
+ "Question Answering (QA)": 0.4683,
1395
+ "Reasoning": 0.4106,
1396
+ "Orthographic and Grammatical Analysis": 0.0,
1397
+ "Safety": 0.3771
1398
+ }
1399
+ },
1400
+ "Meta": {
1401
+ "Model Name": "inceptionai/jais-adapted-7b-chat",
1402
+ "License": "apache-2.0",
1403
+ "Revision": "main",
1404
+ "Precision": "float32",
1405
+ "Params": 7.0,
1406
+ "Total Entries": 279,
1407
+ "Successful Entries": 279,
1408
+ "Failed Entries": 0,
1409
+ "Success Ratio": 1.0
1410
+ }
1411
+ },
1412
+ {
1413
+ "claude-3.5-sonnet Scores": {
1414
+ "3C3H Scores": {
1415
+ "Correctness": 0.6822,
1416
+ "Completeness": 0.6643,
1417
+ "Conciseness": 0.2398,
1418
+ "Helpfulness": 0.6461,
1419
+ "Honesty": 0.6723,
1420
+ "Harmlessness": 0.6813,
1421
+ "3C3H Score": 0.5977
1422
+ },
1423
+ "Tasks Scores": {
1424
+ "Question Answering (QA)": 0.7304,
1425
+ "Reasoning": 0.5472,
1426
+ "Orthographic and Grammatical Analysis": 0.2124,
1427
+ "Safety": 0.3687
1428
+ }
1429
+ },
1430
+ "Meta": {
1431
+ "Model Name": "CohereForAI/c4ai-command-r-plus",
1432
+ "License": "cc-by-nc-4.0",
1433
+ "Revision": "main",
1434
+ "Precision": "float16",
1435
+ "Params": 104.0,
1436
+ "Total Entries": 279,
1437
+ "Successful Entries": 279,
1438
+ "Failed Entries": 0,
1439
+ "Success Ratio": 1.0
1440
+ }
1441
+ },
1442
+ {
1443
+ "claude-3.5-sonnet Scores": {
1444
+ "3C3H Scores": {
1445
+ "Correctness": 0.5144,
1446
+ "Completeness": 0.5096,
1447
+ "Conciseness": 0.1304,
1448
+ "Helpfulness": 0.4829,
1449
+ "Honesty": 0.4922,
1450
+ "Harmlessness": 0.5135,
1451
+ "3C3H Score": 0.4405
1452
+ },
1453
+ "Tasks Scores": {
1454
+ "Question Answering (QA)": 0.4967,
1455
+ "Reasoning": 0.5361,
1456
+ "Orthographic and Grammatical Analysis": 0.0,
1457
+ "Safety": 0.3375
1458
+ }
1459
+ },
1460
+ "Meta": {
1461
+ "Model Name": "CohereForAI/c4ai-command-r7b-12-2024",
1462
+ "License": "cc-by-nc-4.0",
1463
+ "Revision": "main",
1464
+ "Precision": "bfloat16",
1465
+ "Params": 8.0,
1466
+ "Total Entries": 279,
1467
+ "Successful Entries": 278,
1468
+ "Failed Entries": 1,
1469
+ "Success Ratio": 0.9964
1470
+ }
1471
+ },
1472
+ {
1473
+ "claude-3.5-sonnet Scores": {
1474
+ "3C3H Scores": {
1475
+ "Correctness": 0.6511,
1476
+ "Completeness": 0.6499,
1477
+ "Conciseness": 0.1948,
1478
+ "Helpfulness": 0.634,
1479
+ "Honesty": 0.6415,
1480
+ "Harmlessness": 0.6505,
1481
+ "3C3H Score": 0.5703
1482
+ },
1483
+ "Tasks Scores": {
1484
+ "Question Answering (QA)": 0.6214,
1485
+ "Reasoning": 0.6911,
1486
+ "Orthographic and Grammatical Analysis": 0.0,
1487
+ "Safety": 0.6125
1488
+ }
1489
+ },
1490
+ "Meta": {
1491
+ "Model Name": "Qwen/Qwen2.5-32B-Instruct",
1492
+ "License": "apache-2.0",
1493
+ "Revision": "main",
1494
+ "Precision": "bfloat16",
1495
+ "Params": 32.0,
1496
+ "Total Entries": 279,
1497
+ "Successful Entries": 278,
1498
+ "Failed Entries": 1,
1499
+ "Success Ratio": 0.9964
1500
+ }
1501
+ },
1502
+ {
1503
+ "claude-3.5-sonnet Scores": {
1504
+ "3C3H Scores": {
1505
+ "Correctness": 0.546,
1506
+ "Completeness": 0.5448,
1507
+ "Conciseness": 0.1559,
1508
+ "Helpfulness": 0.5233,
1509
+ "Honesty": 0.532,
1510
+ "Harmlessness": 0.5457,
1511
+ "3C3H Score": 0.4746
1512
+ },
1513
+ "Tasks Scores": {
1514
+ "Question Answering (QA)": 0.482,
1515
+ "Reasoning": 0.6222,
1516
+ "Orthographic and Grammatical Analysis": 0.0,
1517
+ "Safety": 0.6
1518
+ }
1519
+ },
1520
+ "Meta": {
1521
+ "Model Name": "Qwen/Qwen2.5-7B-Instruct",
1522
+ "License": "apache-2.0",
1523
+ "Revision": "main",
1524
+ "Precision": "bfloat16",
1525
+ "Params": 7.0,
1526
+ "Total Entries": 279,
1527
+ "Successful Entries": 279,
1528
+ "Failed Entries": 0,
1529
+ "Success Ratio": 1.0
1530
+ }
1531
+ },
1532
+ {
1533
+ "claude-3.5-sonnet Scores": {
1534
+ "3C3H Scores": {
1535
+ "Correctness": 0.4676,
1536
+ "Completeness": 0.464,
1537
+ "Conciseness": 0.1361,
1538
+ "Helpfulness": 0.4047,
1539
+ "Honesty": 0.4158,
1540
+ "Harmlessness": 0.4658,
1541
+ "3C3H Score": 0.3923
1542
+ },
1543
+ "Tasks Scores": {
1544
+ "Question Answering (QA)": 0.427,
1545
+ "Reasoning": 0.4289,
1546
+ "Orthographic and Grammatical Analysis": 0.0,
1547
+ "Safety": 0.6
1548
+ }
1549
+ },
1550
+ "Meta": {
1551
+ "Model Name": "meta-llama/Llama-3.2-11B-Vision-Instruct",
1552
+ "License": "llama3.2",
1553
+ "Revision": "main",
1554
+ "Precision": "bfloat16",
1555
+ "Params": 11.0,
1556
+ "Total Entries": 279,
1557
+ "Successful Entries": 278,
1558
+ "Failed Entries": 1,
1559
+ "Success Ratio": 0.9964
1560
+ }
1561
+ },
1562
+ {
1563
+ "claude-3.5-sonnet Scores": {
1564
+ "3C3H Scores": {
1565
+ "Correctness": 0.5863,
1566
+ "Completeness": 0.5803,
1567
+ "Conciseness": 0.2338,
1568
+ "Helpfulness": 0.5659,
1569
+ "Honesty": 0.5782,
1570
+ "Harmlessness": 0.5854,
1571
+ "3C3H Score": 0.5217
1572
+ },
1573
+ "Tasks Scores": {
1574
+ "Question Answering (QA)": 0.5484,
1575
+ "Reasoning": 0.6389,
1576
+ "Orthographic and Grammatical Analysis": 0.0188,
1577
+ "Safety": 0.6583
1578
+ }
1579
+ },
1580
+ "Meta": {
1581
+ "Model Name": "FreedomIntelligence/AceGPT-v2-32B-Chat",
1582
+ "License": "apache-2.0",
1583
+ "Revision": "main",
1584
+ "Precision": "float16",
1585
+ "Params": 32.0,
1586
+ "Total Entries": 279,
1587
+ "Successful Entries": 278,
1588
+ "Failed Entries": 1,
1589
+ "Success Ratio": 0.9964
1590
+ }
1591
+ },
1592
+ {
1593
+ "claude-3.5-sonnet Scores": {
1594
+ "3C3H Scores": {
1595
+ "Correctness": 0.4277,
1596
+ "Completeness": 0.3955,
1597
+ "Conciseness": 0.0687,
1598
+ "Helpfulness": 0.3127,
1599
+ "Honesty": 0.3668,
1600
+ "Harmlessness": 0.4232,
1601
+ "3C3H Score": 0.3324
1602
+ },
1603
+ "Tasks Scores": {
1604
+ "Question Answering (QA)": 0.3284,
1605
+ "Reasoning": 0.4578,
1606
+ "Orthographic and Grammatical Analysis": 0.0,
1607
+ "Safety": 0.4083
1608
+ }
1609
+ },
1610
+ "Meta": {
1611
+ "Model Name": "Qwen/QwQ-32B-Preview",
1612
+ "License": "apache-2.0",
1613
+ "Revision": "main",
1614
+ "Precision": "bfloat16",
1615
+ "Params": 32.0,
1616
+ "Total Entries": 279,
1617
+ "Successful Entries": 279,
1618
+ "Failed Entries": 0,
1619
+ "Success Ratio": 1.0
1620
+ }
1621
+ },
1622
+ {
1623
+ "claude-3.5-sonnet Scores": {
1624
+ "3C3H Scores": {
1625
+ "Correctness": 0.6558,
1626
+ "Completeness": 0.6486,
1627
+ "Conciseness": 0.1895,
1628
+ "Helpfulness": 0.6276,
1629
+ "Honesty": 0.6402,
1630
+ "Harmlessness": 0.6552,
1631
+ "3C3H Score": 0.5695
1632
+ },
1633
+ "Tasks Scores": {
1634
+ "Question Answering (QA)": 0.6239,
1635
+ "Reasoning": 0.7094,
1636
+ "Orthographic and Grammatical Analysis": 0.0,
1637
+ "Safety": 0.5167
1638
+ }
1639
+ },
1640
+ "Meta": {
1641
+ "Model Name": "maldv/Qwentile2.5-32B-Instruct",
1642
+ "License": "Open",
1643
+ "Revision": "main",
1644
+ "Precision": "float16",
1645
+ "Params": 32.0,
1646
+ "Total Entries": 279,
1647
+ "Successful Entries": 277,
1648
+ "Failed Entries": 2,
1649
+ "Success Ratio": 0.9928
1650
+ }
1651
+ },
1652
+ {
1653
+ "claude-3.5-sonnet Scores": {
1654
+ "3C3H Scores": {
1655
+ "Correctness": 0.8189,
1656
+ "Completeness": 0.8189,
1657
+ "Conciseness": 0.2113,
1658
+ "Helpfulness": 0.7953,
1659
+ "Honesty": 0.8132,
1660
+ "Harmlessness": 0.8189,
1661
+ "3C3H Score": 0.7128
1662
+ },
1663
+ "Tasks Scores": {
1664
+ "Question Answering (QA)": 0.7792,
1665
+ "Reasoning": 0.7222,
1666
+ "Orthographic and Grammatical Analysis": 0.5202,
1667
+ "Safety": 0.4708
1668
+ }
1669
+ },
1670
+ "Meta": {
1671
+ "Model Name": "deepseek-chat",
1672
+ "License": "Proprietary",
1673
+ "Revision": "UNK",
1674
+ "Precision": "UNK",
1675
+ "Params": "UNK",
1676
+ "Total Entries": 279,
1677
+ "Successful Entries": 278,
1678
+ "Failed Entries": 1,
1679
+ "Success Ratio": 0.9964
1680
+ }
1681
+ },
1682
+ {
1683
+ "claude-3.5-sonnet Scores": {
1684
+ "3C3H Scores": {
1685
+ "Correctness": 0.7443,
1686
+ "Completeness": 0.7336,
1687
+ "Conciseness": 0.3056,
1688
+ "Helpfulness": 0.7234,
1689
+ "Honesty": 0.733,
1690
+ "Harmlessness": 0.7443,
1691
+ "3C3H Score": 0.664
1692
+ },
1693
+ "Tasks Scores": {
1694
+ "Question Answering (QA)": 0.7161,
1695
+ "Reasoning": 0.715,
1696
+ "Orthographic and Grammatical Analysis": 0.2352,
1697
+ "Safety": 0.7396
1698
+ }
1699
+ },
1700
+ "Meta": {
1701
+ "Model Name": "claude-3-5-haiku-20241022",
1702
+ "License": "Proprietary",
1703
+ "Revision": "UNK",
1704
+ "Precision": "UNK",
1705
+ "Params": "UNK",
1706
+ "Total Entries": 279,
1707
+ "Successful Entries": 279,
1708
+ "Failed Entries": 0,
1709
+ "Success Ratio": 1.0
1710
+ }
1711
+ },
1712
+ {
1713
+ "claude-3.5-sonnet Scores": {
1714
+ "3C3H Scores": {
1715
+ "Correctness": 0.5914,
1716
+ "Completeness": 0.589,
1717
+ "Conciseness": 0.1974,
1718
+ "Helpfulness": 0.5648,
1719
+ "Honesty": 0.5792,
1720
+ "Harmlessness": 0.5914,
1721
+ "3C3H Score": 0.5189
1722
+ },
1723
+ "Tasks Scores": {
1724
+ "Question Answering (QA)": 0.5998,
1725
+ "Reasoning": 0.5878,
1726
+ "Orthographic and Grammatical Analysis": 0.0,
1727
+ "Safety": 0.4458
1728
+ }
1729
+ },
1730
+ "Meta": {
1731
+ "Model Name": "gpt-3.5-turbo-0125",
1732
+ "License": "Proprietary",
1733
+ "Revision": "UNK",
1734
+ "Precision": "UNK",
1735
+ "Params": "UNK",
1736
+ "Total Entries": 279,
1737
+ "Successful Entries": 279,
1738
+ "Failed Entries": 0,
1739
+ "Success Ratio": 1.0
1740
+ }
1741
+ },
1742
+ {
1743
+ "_last_sync_timestamp": "2025-01-06T10:51:38.943959"
1744
  }
1745
  ]