|
|
|
|
|
import random |
|
import requests |
|
|
|
from datasets import load_dataset, Dataset, DatasetDict |
|
|
|
|
|
path = "pminervini/HaluEval" |
|
|
|
API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}" |
|
response = requests.get(API_URL) |
|
res_json = response.json() |
|
|
|
gold_splits = {"dialogue", "qa", "summarization", "general"} |
|
|
|
available_splits = {split["config"] for split in res_json["splits"]} if "splits" in res_json else set() |
|
|
|
name_to_ds = dict() |
|
|
|
for name in gold_splits: |
|
ds = load_dataset("json", data_files={"data": f"data/{name}_data.json"}) |
|
name_to_ds[name] = ds |
|
|
|
ds.push_to_hub(path, config_name=name) |
|
|
|
|
|
def list_to_dict(lst: list) -> dict: |
|
res = dict() |
|
for entry in lst: |
|
for k, v in entry.items(): |
|
if k not in res: |
|
res[k] = [] |
|
res[k] += [v] |
|
return res |
|
|
|
|
|
for name in gold_splits - {"general"}: |
|
random.seed(42) |
|
ds = name_to_ds[name] |
|
new_entry_lst = [] |
|
|
|
for entry in ds["data"]: |
|
is_hallucinated = random.random() > 0.5 |
|
new_entry = None |
|
if name in {"qa"}: |
|
new_entry = { |
|
"knowledge": entry["knowledge"], |
|
"question": entry["question"], |
|
"answer": entry[f'{"hallucinated" if is_hallucinated else "right"}_answer'], |
|
"hallucination": "yes" if is_hallucinated else "no", |
|
} |
|
if name in {"dialogue"}: |
|
new_entry = { |
|
"knowledge": entry["knowledge"], |
|
"dialogue_history": entry["dialogue_history"], |
|
"response": entry[f'{"hallucinated" if is_hallucinated else "right"}_response'], |
|
"hallucination": "yes" if is_hallucinated else "no", |
|
} |
|
if name in {"summarization"}: |
|
new_entry = { |
|
"document": entry["document"], |
|
"summary": entry[f'{"hallucinated" if is_hallucinated else "right"}_summary'], |
|
"hallucination": "yes" if is_hallucinated else "no", |
|
} |
|
assert new_entry is not None |
|
new_entry_lst += [new_entry] |
|
new_ds_map = list_to_dict(new_entry_lst) |
|
new_ds = Dataset.from_dict(new_ds_map) |
|
new_dsd = DatasetDict({"data": new_ds}) |
|
|
|
new_dsd.push_to_hub(path, config_name=f"{name}_samples") |
|
|