chenxwh
/

AVeriTeC

Model card Files Files and versions

xet

Community

chenxwh committed on Jul 15, 2024

Commit

a106f67

verified ·

1 Parent(s): caff402

Upload veracity_with_scraped_text.py

Browse files

Files changed (1) hide show

src/prediction/veracity_with_scraped_text.py +68 -0

src/prediction/veracity_with_scraped_text.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import os
+import argparse
+import json
+from tqdm import tqdm
def load_url_text_map(knowledge_store_dir, claim_id):
    """Map each URL in a claim's knowledge-store file to its scraped text.

    The file ``<claim_id>.json`` inside ``knowledge_store_dir`` is read as
    JSON Lines: every non-blank line is parsed as one JSON object whose
    ``url`` value becomes a key and whose ``url2text`` value (an iterable of
    strings) is joined with single spaces to become the text for that URL.

    Args:
        knowledge_store_dir: Directory holding the per-claim files.
        claim_id: Claim identifier; interpolated into the file name.

    Returns:
        A dict mapping URL to concatenated text. Empty when the claim has no
        file in the knowledge store. If the same URL appears on several
        lines, the last one wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``url`` or ``url2text``.
    """
    knowledge_file = os.path.join(knowledge_store_dir, f"{claim_id}.json")
    if not os.path.exists(knowledge_file):
        return {}

    url_text_map = {}
    # Decode explicitly as UTF-8: the platform default (e.g. cp1252) can
    # fail on or mangle scraped text, and the output is written as UTF-8.
    with open(knowledge_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            url_text_map[data["url"]] = " ".join(data["url2text"])
    return url_text_map
def main():
    """Add a ``scraped_text`` field to every evidence item of a prediction file.

    Reads the veracity prediction JSON (a list of claims), looks up each
    evidence URL in that claim's knowledge-store file, and writes the
    augmented predictions to the output file. Evidence whose URL has no
    (or only empty) scraped text is left unchanged and a warning is printed.
    """
    parser = argparse.ArgumentParser(
        description="Add scraped_text field to the prediction file."
    )
    parser.add_argument(
        "-i",
        "--veracity_prediction_file",
        default="data_store/dev_veracity_prediction.json",
        help="Json file with the veracity predictions.",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        default="data_store/dev_veracity_prediction_for_submission.json",
        help="Json file with the veracity predictions and the scraped_text.",
    )
    parser.add_argument(
        "--knowledge_store_dir",
        type=str,
        # Without a value this would reach os.path.join() as None and fail
        # with an opaque TypeError on the first claim; fail fast instead.
        required=True,
        help="Directory of json files of the knowledge store containing url2text.",
    )
    args = parser.parse_args()

    # Read as UTF-8 so the input decoding matches the UTF-8 output below.
    with open(args.veracity_prediction_file, encoding="utf-8") as f:
        predictions = json.load(f)

    for claim in tqdm(predictions, desc="Processing claims"):
        claim_id = claim["claim_id"]
        url_text_map = load_url_text_map(args.knowledge_store_dir, claim_id)

        # Process each evidence in the claim and add scraped_text
        for evidence in claim["evidence"]:
            url = evidence["url"]
            scraped_text = url_text_map.get(url)
            if scraped_text:
                evidence["scraped_text"] = scraped_text
            else:
                # tqdm.write prints (to stdout) without corrupting the
                # progress bar, unlike a bare print() inside the loop.
                tqdm.write(
                    f"Warning: No scraped text found for claim_id {claim_id} and url {url}"
                )

    with open(args.output_file, "w", encoding="utf-8") as output_file:
        json.dump(predictions, output_file, ensure_ascii=False, indent=4)

    print(f"Updated JSON saved to {args.output_file}")


if __name__ == "__main__":
    main()