Introduction
This is the official repository of the paper Annotation-Efficient Universal Honesty Alignment.
This repository provides modules that extend Qwen2.5-7B-Instruct with the ability to generate accurate confidence scores before response generation, indicating how likely the model is to answer a given question correctly across tasks. We offer two types of modules—LoRA + Linear Head and Linear Head—along with model parameters under three training settings:
- Elicitation (greedy): Trained on all questions (over 560k) using self-consistency-based confidence annotations.
- Calibration-Only (right): Trained on questions with explicit correctness annotations.
- EliCal (hybrid): Initialized from the Elicitation model and further trained on correctness-labeled data.
For both Calibration-Only and EliCal settings, we provide models trained with different amounts of annotated data (1k, 2k, 3k, 5k, 8k, 10k, 20k, 30k, 50k, 80k, 200k, 560k+). Since LoRA + Linear Head is the main configuration used in our paper, the following description is based on this setup.
In our model, LoRA is applied to all linear layers with r = 8 and α = 16. The Linear Head takes as input the final-layer hidden state of the last input token and predicts a confidence score between 0 and 1, representing the model’s estimated probability of answering the question correctly.
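For reference, this corresponds to the following PEFT LoraConfig (a minimal sketch; the target-module list is the same one used in the inference snippet later in this card, and lora_dropout = 0.0 follows the training-configuration directory names listed under Files):

from peft import LoraConfig

# LoRA on all linear projection layers of the backbone, r = 8, alpha = 16
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.0,
    bias="none",
)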
Model Architecture
import torch
import torch.nn as nn
from transformers import AutoModel
from transformers.modeling_outputs import CausalLMOutput
from peft import get_peft_model


class LMWithVectorHead(nn.Module):
    def __init__(self, model_name, lora_config, output_dim=1):
        super().__init__()
        backbone = AutoModel.from_pretrained(model_name, device_map='cpu')
        # backbone.config.use_cache = False
        self.peft_model = get_peft_model(backbone, lora_config)
        self.config = backbone.config
        hidden_size = backbone.config.hidden_size
        self.vector_head = nn.Linear(hidden_size, output_dim)  # output dimension is 1

    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        """Enable gradient checkpointing and handle optional extra kwargs."""
        self.peft_model.enable_input_require_grads()
        if gradient_checkpointing_kwargs is not None:
            self.peft_model.gradient_checkpointing_enable(**gradient_checkpointing_kwargs)
        else:
            self.peft_model.gradient_checkpointing_enable()

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.peft_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        # Take the final-layer hidden state of the last token
        last_hidden = outputs.last_hidden_state     # [B, T, H]
        cls_hidden = last_hidden[:, -1, :]          # [B, H]
        logits = self.vector_head(cls_hidden)       # [B, 1]
        logits = torch.sigmoid(logits).squeeze(-1)  # apply sigmoid and squeeze to [B]

        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()                 # MSE between predicted confidence and label
            loss = loss_fct(logits, labels)

        return CausalLMOutput(
            loss=loss,
            logits=logits
        )
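Below is a minimal usage sketch of this class, assuming the lora_config from the Introduction; the question string and the confidence target are illustrative, and the exact prompt format used for training is documented in the GitHub repo:

import torch
from transformers import AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct"          # backbone described above
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = LMWithVectorHead(model_name, lora_config)

inputs = tokenizer("Who wrote 'Pride and Prejudice'?", return_tensors="pt")
labels = torch.tensor([1.0])                     # illustrative confidence target in [0, 1]

out = model(input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=labels)
print(out.logits, out.loss)                      # predicted confidence and MSE training loss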
Inference
The following shows how to load the model. For more details, please refer to the GitHub repo.
import torch
from transformers import AutoModel
from peft import PeftModel, LoraConfig

device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Load the base model
base_model = AutoModel.from_pretrained(args.model_path)

# 2. Load the trained LoRA adapter onto the base model
peft_model = PeftModel.from_pretrained(
    base_model,            # use the base model, not model.peft_model
    args.lora_path,
    adapter_name="default"
)

# 3. Build the full model structure
lora_config = LoraConfig(
    r=args.r,
    lora_alpha=args.alpha,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=args.lora_dropout,
    bias="none",
)
model = LMWithVectorHead(args.model_path, lora_config)

# 4. Swap in the model that already carries the trained LoRA adapter
model.peft_model = peft_model

# 5. Load the Linear Head weights
state_dict = torch.load(args.vector_head_path, map_location=device)
model.vector_head.load_state_dict(state_dict)

# 6. Activate the adapter and move the model to the device
model.peft_model.set_adapter("default")
model = model.to(device)

# Evaluation mode
model.eval()
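Once loaded, a confidence score for a new question is obtained with a single forward pass (a minimal sketch; the raw question string is illustrative, and the actual prompt format used in the paper may differ; see the GitHub repo):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(args.model_path)

question = "In which year did the French Revolution begin?"
inputs = tokenizer(question, return_tensors="pt").to(device)

with torch.no_grad():
    out = model(input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"])

confidence = out.logits.item()   # estimated probability of answering correctly, in [0, 1]
print(f"confidence = {confidence:.3f}")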
Files
/lora
├── greedy_answer_conf
│ └── long_qa
│ └── batchsize16_accumulation8_epochs10_weightdecay0.1_r8_alpha16_loradropout0.0 (training configuration)
│ ├── best_checkpoints
│ │ ├── lora_epoch_best/ # Path to LoRA module
│ │ └── vector_head_epoch_best.pt # Path to Linear Head weights
│ └── test_losses.json # Test loss for each epoch
│
├── hybrid_answer_conf
│ └── long_qa
│ ├── batchsize16_accumulation8_epochs10_weightdecay0.1_r8_alpha16_loradropout0.0 (560k samples)
│ ├── batchsize16_accumulation8_epochs50_weightdecay0.1_r8_alpha16_loradropout0.0_1k_training_samples (1k samples)
│ └── batchsize16_accumulation8_epochs50_weightdecay0.1_r8_alpha16_loradropout0.0_2k_training_samples (2k samples)
│
└── right_answer_conf
└── long_qa
└── ... # Same format as above
/mlp
...
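For example, to evaluate the Elicitation (greedy) LoRA + Linear Head model, the inference arguments above would point at that configuration's checkpoint files (illustrative relative paths; adjust to your local copy):

# Illustrative: Elicitation (greedy) LoRA + Linear Head checkpoint paths
config_dir = ("lora/greedy_answer_conf/long_qa/"
              "batchsize16_accumulation8_epochs10_weightdecay0.1_r8_alpha16_loradropout0.0")
args.lora_path = f"{config_dir}/best_checkpoints/lora_epoch_best"                   # LoRA adapter
args.vector_head_path = f"{config_dir}/best_checkpoints/vector_head_epoch_best.pt"  # Linear Head weights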