{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.27,
      "acc_stderr": 0.04461960433384741,
      "acc_norm": 0.27,
      "acc_norm_stderr": 0.04461960433384741
    },
    "hendrycksTest-anatomy": {
      "acc": 0.3925925925925926,
      "acc_stderr": 0.04218506215368879,
      "acc_norm": 0.3925925925925926,
      "acc_norm_stderr": 0.04218506215368879
    },
    "hendrycksTest-astronomy": {
      "acc": 0.3815789473684211,
      "acc_stderr": 0.03953173377749194,
      "acc_norm": 0.3815789473684211,
      "acc_norm_stderr": 0.03953173377749194
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.4,
      "acc_stderr": 0.049236596391733084,
      "acc_norm": 0.4,
      "acc_norm_stderr": 0.049236596391733084
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.37735849056603776,
      "acc_stderr": 0.029832808114796005,
      "acc_norm": 0.37735849056603776,
      "acc_norm_stderr": 0.029832808114796005
    },
    "hendrycksTest-college_biology": {
      "acc": 0.3680555555555556,
      "acc_stderr": 0.040329990539607195,
      "acc_norm": 0.3680555555555556,
      "acc_norm_stderr": 0.040329990539607195
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.25,
      "acc_stderr": 0.04351941398892446,
      "acc_norm": 0.25,
      "acc_norm_stderr": 0.04351941398892446
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.37,
      "acc_stderr": 0.04852365870939099,
      "acc_norm": 0.37,
      "acc_norm_stderr": 0.04852365870939099
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.3,
      "acc_stderr": 0.046056618647183814,
      "acc_norm": 0.3,
      "acc_norm_stderr": 0.046056618647183814
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.3236994219653179,
      "acc_stderr": 0.0356760379963917,
      "acc_norm": 0.3236994219653179,
      "acc_norm_stderr": 0.0356760379963917
    },
    "hendrycksTest-college_physics": {
      "acc": 0.22549019607843138,
      "acc_stderr": 0.04158307533083286,
      "acc_norm": 0.22549019607843138,
      "acc_norm_stderr": 0.04158307533083286
    },
    "hendrycksTest-computer_security": {
      "acc": 0.46,
      "acc_stderr": 0.05009082659620332,
      "acc_norm": 0.46,
      "acc_norm_stderr": 0.05009082659620332
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.33617021276595743,
      "acc_stderr": 0.03088161852067694,
      "acc_norm": 0.33617021276595743,
      "acc_norm_stderr": 0.03088161852067694
    },
    "hendrycksTest-econometrics": {
      "acc": 0.3157894736842105,
      "acc_stderr": 0.04372748290278007,
      "acc_norm": 0.3157894736842105,
      "acc_norm_stderr": 0.04372748290278007
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.3793103448275862,
      "acc_stderr": 0.04043461861916747,
      "acc_norm": 0.3793103448275862,
      "acc_norm_stderr": 0.04043461861916747
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.25925925925925924,
      "acc_stderr": 0.02256989707491841,
      "acc_norm": 0.25925925925925924,
      "acc_norm_stderr": 0.02256989707491841
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.20634920634920634,
      "acc_stderr": 0.03619604524124251,
      "acc_norm": 0.20634920634920634,
      "acc_norm_stderr": 0.03619604524124251
    },
    "hendrycksTest-global_facts": {
      "acc": 0.3,
      "acc_stderr": 0.046056618647183814,
      "acc_norm": 0.3,
      "acc_norm_stderr": 0.046056618647183814
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.34838709677419355,
      "acc_stderr": 0.02710482632810094,
      "acc_norm": 0.34838709677419355,
      "acc_norm_stderr": 0.02710482632810094
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.3054187192118227,
      "acc_stderr": 0.03240661565868408,
      "acc_norm": 0.3054187192118227,
      "acc_norm_stderr": 0.03240661565868408
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.35,
      "acc_stderr": 0.047937248544110196,
      "acc_norm": 0.35,
      "acc_norm_stderr": 0.047937248544110196
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.36363636363636365,
      "acc_stderr": 0.03756335775187896,
      "acc_norm": 0.36363636363636365,
      "acc_norm_stderr": 0.03756335775187896
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.3686868686868687,
      "acc_stderr": 0.03437305501980619,
      "acc_norm": 0.3686868686868687,
      "acc_norm_stderr": 0.03437305501980619
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.5181347150259067,
      "acc_stderr": 0.036060650018329185,
      "acc_norm": 0.5181347150259067,
      "acc_norm_stderr": 0.036060650018329185
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.34102564102564104,
      "acc_stderr": 0.024035489676335068,
      "acc_norm": 0.34102564102564104,
      "acc_norm_stderr": 0.024035489676335068
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.2851851851851852,
      "acc_stderr": 0.027528599210340496,
      "acc_norm": 0.2851851851851852,
      "acc_norm_stderr": 0.027528599210340496
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.35294117647058826,
      "acc_stderr": 0.03104194130405927,
      "acc_norm": 0.35294117647058826,
      "acc_norm_stderr": 0.03104194130405927
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.31125827814569534,
      "acc_stderr": 0.03780445850526732,
      "acc_norm": 0.31125827814569534,
      "acc_norm_stderr": 0.03780445850526732
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.4018348623853211,
      "acc_stderr": 0.02102010617299701,
      "acc_norm": 0.4018348623853211,
      "acc_norm_stderr": 0.02102010617299701
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.3148148148148148,
      "acc_stderr": 0.03167468706828979,
      "acc_norm": 0.3148148148148148,
      "acc_norm_stderr": 0.03167468706828979
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.37254901960784315,
      "acc_stderr": 0.033933885849584025,
      "acc_norm": 0.37254901960784315,
      "acc_norm_stderr": 0.033933885849584025
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.4345991561181435,
      "acc_stderr": 0.03226759995510144,
      "acc_norm": 0.4345991561181435,
      "acc_norm_stderr": 0.03226759995510144
    },
    "hendrycksTest-human_aging": {
      "acc": 0.42152466367713004,
      "acc_stderr": 0.03314190222110658,
      "acc_norm": 0.42152466367713004,
      "acc_norm_stderr": 0.03314190222110658
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.3969465648854962,
      "acc_stderr": 0.04291135671009224,
      "acc_norm": 0.3969465648854962,
      "acc_norm_stderr": 0.04291135671009224
    },
    "hendrycksTest-international_law": {
      "acc": 0.5371900826446281,
      "acc_stderr": 0.04551711196104218,
      "acc_norm": 0.5371900826446281,
      "acc_norm_stderr": 0.04551711196104218
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.35185185185185186,
      "acc_stderr": 0.046166311118017146,
      "acc_norm": 0.35185185185185186,
      "acc_norm_stderr": 0.046166311118017146
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.3374233128834356,
      "acc_stderr": 0.037149084099355745,
      "acc_norm": 0.3374233128834356,
      "acc_norm_stderr": 0.037149084099355745
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.2857142857142857,
      "acc_stderr": 0.042878587513404544,
      "acc_norm": 0.2857142857142857,
      "acc_norm_stderr": 0.042878587513404544
    },
    "hendrycksTest-management": {
      "acc": 0.36893203883495146,
      "acc_stderr": 0.04777615181156739,
      "acc_norm": 0.36893203883495146,
      "acc_norm_stderr": 0.04777615181156739
    },
    "hendrycksTest-marketing": {
      "acc": 0.45726495726495725,
      "acc_stderr": 0.03263622596380688,
      "acc_norm": 0.45726495726495725,
      "acc_norm_stderr": 0.03263622596380688
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.44,
      "acc_stderr": 0.04988876515698589,
      "acc_norm": 0.44,
      "acc_norm_stderr": 0.04988876515698589
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.4763729246487867,
      "acc_stderr": 0.01785998976517645,
      "acc_norm": 0.4763729246487867,
      "acc_norm_stderr": 0.01785998976517645
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.3699421965317919,
      "acc_stderr": 0.025992472029306386,
      "acc_norm": 0.3699421965317919,
      "acc_norm_stderr": 0.025992472029306386
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.23798882681564246,
      "acc_stderr": 0.01424263007057488,
      "acc_norm": 0.23798882681564246,
      "acc_norm_stderr": 0.01424263007057488
    },
    "hendrycksTest-nutrition": {
      "acc": 0.4150326797385621,
      "acc_stderr": 0.028213504177824103,
      "acc_norm": 0.4150326797385621,
      "acc_norm_stderr": 0.028213504177824103
    },
    "hendrycksTest-philosophy": {
      "acc": 0.4405144694533762,
      "acc_stderr": 0.028196400574197426,
      "acc_norm": 0.4405144694533762,
      "acc_norm_stderr": 0.028196400574197426
    },
    "hendrycksTest-prehistory": {
      "acc": 0.37962962962962965,
      "acc_stderr": 0.02700252103451648,
      "acc_norm": 0.37962962962962965,
      "acc_norm_stderr": 0.02700252103451648
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.29432624113475175,
      "acc_stderr": 0.02718712701150379,
      "acc_norm": 0.29432624113475175,
      "acc_norm_stderr": 0.02718712701150379
    },
    "hendrycksTest-professional_law": {
      "acc": 0.30378096479791394,
      "acc_stderr": 0.011745787720472472,
      "acc_norm": 0.30378096479791394,
      "acc_norm_stderr": 0.011745787720472472
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.40808823529411764,
      "acc_stderr": 0.029855261393483924,
      "acc_norm": 0.40808823529411764,
      "acc_norm_stderr": 0.029855261393483924
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.35294117647058826,
      "acc_stderr": 0.01933314202079706,
      "acc_norm": 0.35294117647058826,
      "acc_norm_stderr": 0.01933314202079706
    },
    "hendrycksTest-public_relations": {
      "acc": 0.4090909090909091,
      "acc_stderr": 0.04709306978661896,
      "acc_norm": 0.4090909090909091,
      "acc_norm_stderr": 0.04709306978661896
    },
    "hendrycksTest-security_studies": {
      "acc": 0.3224489795918367,
      "acc_stderr": 0.029923100563683906,
      "acc_norm": 0.3224489795918367,
      "acc_norm_stderr": 0.029923100563683906
    },
    "hendrycksTest-sociology": {
      "acc": 0.42786069651741293,
      "acc_stderr": 0.03498541988407795,
      "acc_norm": 0.42786069651741293,
      "acc_norm_stderr": 0.03498541988407795
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.53,
      "acc_stderr": 0.05016135580465919,
      "acc_norm": 0.53,
      "acc_norm_stderr": 0.05016135580465919
    },
    "hendrycksTest-virology": {
      "acc": 0.3614457831325301,
      "acc_stderr": 0.037400593820293204,
      "acc_norm": 0.3614457831325301,
      "acc_norm_stderr": 0.037400593820293204
    },
    "hendrycksTest-world_religions": {
      "acc": 0.4678362573099415,
      "acc_stderr": 0.038268824176603704,
      "acc_norm": 0.4678362573099415,
      "acc_norm_stderr": 0.038268824176603704
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "sparseml",
    "model_args": "pretrained=/network/alexandre/research/cerebras/llama2_7B_sparse70_retrained/checkpoint,dtype=bfloat16",
    "num_fewshot": 5,
    "batch_size": "4",
    "batch_sizes": [],
    "device": "cuda:6",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}