{"task_name": "arc_challenge::olmes", "task_hash": "11d27cc9476c8b7bf020c4361973aaa5", "model_hash": "91bf664a9e1a7082cab09f7cd7b429f6", "model_config": {"model": "allenai/OLMo-1B-hf", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf", "metadata": {"alias": "olmo-1b"}}, "task_config": {"task_name": "arc_challenge::olmes", "task_core": "arc_challenge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"extra_metric_names": ["no_answer"]}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": false, "external_eval": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "arc_challenge::olmes"}}, "compute_config": {"batch_size": 1, "max_batch_size": 32, "output_dir": "olmo-1b-arc", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false}, "processing_time": 10.335389375686646, "current_date": "2025-05-12 03:01:31 UTC", "num_instances": 2344, "beaker_info": {}, "metrics": {"primary_score": 0.3856655290102389}, "task_idx": null} {"task_name": "arc_challenge:mc", "task_hash": "867052288d273ab0fe8a12a6c5c548e6", "model_hash": "91bf664a9e1a7082cab09f7cd7b429f6", "model_config": {"model": "allenai/OLMo-1B-hf", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf", "metadata": {"alias": "olmo-1b"}}, "task_config": {"task_name": "arc_challenge:mc", "task_core": "arc_challenge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"extra_metric_names": ["no_answer"]}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": false, "external_eval": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:mc::olmes"}}, "compute_config": {"batch_size": 1, "max_batch_size": 32, "output_dir": "olmo-1b-arc", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false}, "processing_time": 8.28089714050293, "current_date": "2025-05-12 03:01:31 UTC", "num_instances": 1172, "beaker_info": {}, "metrics": {"acc_raw": 0.25341296928327645, "acc_per_token": 0.25341296928327645, "acc_per_char": 0.25341296928327645, "acc_per_byte": 0.25341296928327645, "sum_logits_corr": -1.4381909157330672, "logits_per_token_corr": -1.4381909157330672, "logits_per_char_corr": -0.7190954578665336, "bits_per_byte_corr": 1.0374354509905723, "primary_score": 0.25341296928327645, "extra_metrics": {"no_answer": 0.0}}, "task_idx": 0} {"task_name": "arc_challenge", "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "91bf664a9e1a7082cab09f7cd7b429f6", "model_config": {"model": "allenai/OLMo-1B-hf", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf", "metadata": {"alias": "olmo-1b"}}, "task_config": {"task_name": "arc_challenge", "task_core": "arc_challenge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_uncond", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000, "extra_metric_names": ["no_answer"]}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": false, "external_eval": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:rc::olmes"}}, "compute_config": {"batch_size": 1, "max_batch_size": 32, "output_dir": "olmo-1b-arc", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false}, "processing_time": 2.054492235183716, "current_date": "2025-05-12 03:02:16 UTC", "num_instances": 1172, "beaker_info": {}, "metrics": {"acc_raw": 0.3293515358361775, "acc_per_token": 0.34726962457337884, "acc_per_char": 0.3438566552901024, "acc_per_byte": 0.3438566552901024, "sum_logits_corr": -15.75188249934452, "logits_per_token_corr": -2.916146519577311, "logits_per_char_corr": -0.5858619454662958, "bits_per_byte_corr": 0.8440443257845128, "acc_uncond": 0.3856655290102389, "primary_score": 0.3856655290102389, "extra_metrics": {"no_answer": 0.0}}, "task_idx": 1}