LexGPT-V3 / README.md
lex-hue's picture
Update README.md
95bef32 verified
|
raw
history blame
14.4 kB
metadata
license: mit
language:
  - en
  - de

This model was just a test training run to see how our new training algorithm and data perform.

The model is based on Mistral v0.1.

As this was only a test run, we simply evaluated it, and here are the results. The model has not improved.

| Model | Turn 1 Score | Turn 2 Score | Average Score |
|---|---|---|---|
| gpt-4 | 8.95625 | 9.025000 | 8.990625 |
| gpt-3.5-turbo | 8.075000 | 7.943750 | 8.009375 |
| claude-v1 | 8.150000 | 7.900000 | 8.025000 |
| LexGPT-V3 | 8.14375 | 7.719355 | 7.926667 |
| vicuna-13b-v1.3 | 6.812500 | 5.962500 | 6.387500 |

Open-LLM Leaderboard Results: ''' { "all": { "acc": 0.647154984215818, "acc_stderr": 0.03221441224437104, "acc_norm": 0.6487599114885558, "acc_norm_stderr": 0.032860268812293904, "mc1": 0.4283965728274174, "mc1_stderr": 0.017323088597314757, "mc2": 0.5998074537794252, "mc2_stderr": 0.015494960379071198 }, "harness|arc:challenge|25": { "acc": 0.64419795221843, "acc_stderr": 0.01399057113791876, "acc_norm": 0.6646757679180887, "acc_norm_stderr": 0.013796182947785562 }, "harness|hellaswag|10": { "acc": 0.6782513443537144, "acc_stderr": 0.004661924314756093, "acc_norm": 0.8590918143796057, "acc_norm_stderr": 0.003472157511639361 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5925925925925926, "acc_stderr": 0.04244633238353227, "acc_norm": 0.5925925925925926, "acc_norm_stderr": 0.04244633238353227 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6973684210526315, "acc_stderr": 0.03738520676119667, "acc_norm": 0.6973684210526315, "acc_norm_stderr": 0.03738520676119667 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.59, "acc_stderr": 0.04943110704237102, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7132075471698113, "acc_stderr": 0.027834912527544057, "acc_norm": 0.7132075471698113, "acc_norm_stderr": 0.027834912527544057 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7847222222222222, "acc_stderr": 0.03437079344106135, "acc_norm": 0.7847222222222222, "acc_norm_stderr": 0.03437079344106135 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.653179190751445, "acc_stderr": 0.036291466701596636, "acc_norm": 0.653179190751445, "acc_norm_stderr": 0.036291466701596636 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4019607843137255, "acc_stderr": 0.04878608714466996, "acc_norm": 0.4019607843137255, "acc_norm_stderr": 0.04878608714466996 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.76, "acc_stderr": 0.042923469599092816, "acc_norm": 0.76, "acc_norm_stderr": 0.042923469599092816 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5914893617021276, "acc_stderr": 0.032134180267015755, "acc_norm": 0.5914893617021276, "acc_norm_stderr": 0.032134180267015755 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.45614035087719296, "acc_stderr": 0.04685473041907789, "acc_norm": 0.45614035087719296, "acc_norm_stderr": 0.04685473041907789 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5379310344827586, "acc_stderr": 0.041546596717075474, "acc_norm": 0.5379310344827586, "acc_norm_stderr": 0.041546596717075474 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4126984126984127, "acc_stderr": 0.02535574126305527, "acc_norm": 0.4126984126984127, "acc_norm_stderr": 0.02535574126305527 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4603174603174603, "acc_stderr": 0.04458029125470973, "acc_norm": 0.4603174603174603, "acc_norm_stderr": 0.04458029125470973 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7967741935483871, "acc_stderr": 0.02289168798455496, "acc_norm": 0.7967741935483871, "acc_norm_stderr": 0.02289168798455496 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.71, "acc_stderr": 0.045604802157206845, "acc_norm": 0.71, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7878787878787878, "acc_stderr": 0.031922715695483, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.031922715695483 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7878787878787878, "acc_stderr": 0.029126522834586815, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.029126522834586815 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8963730569948186, "acc_stderr": 0.021995311963644237, "acc_norm": 0.8963730569948186, "acc_norm_stderr": 0.021995311963644237 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6846153846153846, "acc_stderr": 0.02355964698318994, "acc_norm": 0.6846153846153846, "acc_norm_stderr": 0.02355964698318994 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.35185185185185186, "acc_stderr": 0.02911661760608301, "acc_norm": 0.35185185185185186, "acc_norm_stderr": 0.02911661760608301 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.7100840336134454, "acc_stderr": 0.029472485833136094, "acc_norm": 0.7100840336134454, "acc_norm_stderr": 0.029472485833136094 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.3708609271523179, "acc_stderr": 0.03943966699183629, "acc_norm": 0.3708609271523179, "acc_norm_stderr": 0.03943966699183629 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8293577981651377, "acc_stderr": 0.016129271025099857, "acc_norm": 0.8293577981651377, "acc_norm_stderr": 0.016129271025099857 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5462962962962963, "acc_stderr": 0.033953227263757976, "acc_norm": 0.5462962962962963, "acc_norm_stderr": 
0.033953227263757976 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8333333333333334, "acc_stderr": 0.026156867523931045, "acc_norm": 0.8333333333333334, "acc_norm_stderr": 0.026156867523931045 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.8227848101265823, "acc_stderr": 0.024856364184503224, "acc_norm": 0.8227848101265823, "acc_norm_stderr": 0.024856364184503224 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.7130044843049327, "acc_stderr": 0.03036037971029195, "acc_norm": 0.7130044843049327, "acc_norm_stderr": 0.03036037971029195 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7709923664122137, "acc_stderr": 0.036853466317118506, "acc_norm": 0.7709923664122137, "acc_norm_stderr": 0.036853466317118506 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7520661157024794, "acc_stderr": 0.03941897526516302, "acc_norm": 0.7520661157024794, "acc_norm_stderr": 0.03941897526516302 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7592592592592593, "acc_stderr": 0.04133119440243839, "acc_norm": 0.7592592592592593, "acc_norm_stderr": 0.04133119440243839 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.754601226993865, "acc_stderr": 0.03380939813943354, "acc_norm": 0.754601226993865, "acc_norm_stderr": 0.03380939813943354 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.5089285714285714, "acc_stderr": 0.04745033255489123, "acc_norm": 0.5089285714285714, "acc_norm_stderr": 0.04745033255489123 }, "harness|hendrycksTest-management|5": { "acc": 0.7378640776699029, "acc_stderr": 0.043546310772605956, "acc_norm": 0.7378640776699029, "acc_norm_stderr": 0.043546310772605956 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8547008547008547, "acc_stderr": 0.023086635086841407, "acc_norm": 0.8547008547008547, "acc_norm_stderr": 0.023086635086841407 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.72, "acc_stderr": 0.045126085985421276, "acc_norm": 0.72, "acc_norm_stderr": 0.045126085985421276 
}, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.8186462324393359, "acc_stderr": 0.01377869377846408, "acc_norm": 0.8186462324393359, "acc_norm_stderr": 0.01377869377846408 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7341040462427746, "acc_stderr": 0.023786203255508283, "acc_norm": 0.7341040462427746, "acc_norm_stderr": 0.023786203255508283 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.329608938547486, "acc_stderr": 0.01572153107518388, "acc_norm": 0.329608938547486, "acc_norm_stderr": 0.01572153107518388 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7450980392156863, "acc_stderr": 0.02495418432487991, "acc_norm": 0.7450980392156863, "acc_norm_stderr": 0.02495418432487991 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.6913183279742765, "acc_stderr": 0.026236965881153266, "acc_norm": 0.6913183279742765, "acc_norm_stderr": 0.026236965881153266 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7314814814814815, "acc_stderr": 0.02465968518596728, "acc_norm": 0.7314814814814815, "acc_norm_stderr": 0.02465968518596728 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.49645390070921985, "acc_stderr": 0.02982674915328092, "acc_norm": 0.49645390070921985, "acc_norm_stderr": 0.02982674915328092 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.4817470664928292, "acc_stderr": 0.012761723960595472, "acc_norm": 0.4817470664928292, "acc_norm_stderr": 0.012761723960595472 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6875, "acc_stderr": 0.02815637344037142, "acc_norm": 0.6875, "acc_norm_stderr": 0.02815637344037142 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6454248366013072, "acc_stderr": 0.0193533605475537, "acc_norm": 0.6454248366013072, "acc_norm_stderr": 0.0193533605475537 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6727272727272727, "acc_stderr": 0.0449429086625209, "acc_norm": 0.6727272727272727, "acc_norm_stderr": 0.0449429086625209 }, 
"harness|hendrycksTest-security_studies|5": { "acc": 0.7306122448979592, "acc_stderr": 0.02840125202902294, "acc_norm": 0.7306122448979592, "acc_norm_stderr": 0.02840125202902294 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8159203980099502, "acc_stderr": 0.027403859410786845, "acc_norm": 0.8159203980099502, "acc_norm_stderr": 0.027403859410786845 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.88, "acc_stderr": 0.03265986323710906, "acc_norm": 0.88, "acc_norm_stderr": 0.03265986323710906 }, "harness|hendrycksTest-virology|5": { "acc": 0.5301204819277109, "acc_stderr": 0.03885425420866767, "acc_norm": 0.5301204819277109, "acc_norm_stderr": 0.03885425420866767 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8538011695906432, "acc_stderr": 0.02709729011807081, "acc_norm": 0.8538011695906432, "acc_norm_stderr": 0.02709729011807081 }, "harness|truthfulqa:mc|0": { "mc1": 0.4283965728274174, "mc1_stderr": 0.017323088597314757, "mc2": 0.5998074537794252, "mc2_stderr": 0.015494960379071198 }, "harness|winogrande|5": { "acc": 0.7853196527229677, "acc_stderr": 0.011539912734345403 }, "harness|gsm8k|5": { "acc": 0.6156178923426838, "acc_stderr": 0.013399219253698186 } } '''