Spaces:

AIML-TUDA
/

VerifiableRewardsForScalableLogicalReasoning

Running

App Files Files Community

Lukas Helff committed 6 days ago

Commit

58596cd

1 Parent(s): 03d8e50

extract rule from NL

Browse files

Files changed (2) hide show

VerifiableRewardsForScalableLogicalReasoning.py +50 -12
app.py +1 -1

VerifiableRewardsForScalableLogicalReasoning.py CHANGED Viewed

@@ -100,7 +100,7 @@ Returns:
 """
-def _evaluate_with_prolog(rule_to_evaluate, validation_program, eval_config, timeout=5):
     """
     Evaluates a predicted rule against the validation program using Prolog.
     """
@@ -108,6 +108,7 @@ def _evaluate_with_prolog(rule_to_evaluate, validation_program, eval_config, tim
     positive_pred = eval_config.get("positive_predicate", "eastbound")
     negative_pred = eval_config.get("negative_predicate", "westbound")
     # extract predicate from rule_to_evaluate
     if positive_pred not in rule_to_evaluate:
         logger.warning(f"Rule '{rule_to_evaluate}' does not contain positive predicate '{positive_pred}'")
         return {
@@ -137,8 +138,11 @@ check({vars}) :- neg({vars}), \\+ {positive_pred}({vars}).  % negative rejected
 % Count successful checks
 check_count(Count) :-
-    setof(({vars}), ((pos({vars}); neg({vars})), check({vars})), CorrectExamples),
-    length(CorrectExamples, Count).
 check_all :- forall((pos({vars});neg({vars})), check({vars})).
     """
@@ -165,12 +169,13 @@ check_all :- forall((pos({vars});neg({vars})), check({vars})).
             timeout=timeout,
             text=True
         )
         # Extract partial score from output
-        partial_score = int(result.stdout.strip()) / pos_negs if pos_negs > 0 else 0.0
         is_correct = True if partial_score == 1.0 else False
-        error = result.stderr if result.stderr else None
         t1 = time.time()
         return {
@@ -186,13 +191,50 @@ check_all :- forall((pos({vars});neg({vars})), check({vars})).
         return {"is_correct": False, "partial_score": 0.0, "syntax_valid": False,
                 "error": f"Evaluation timed out after {timeout} seconds"}
     except Exception as e:
-        logger.warning(f"Error evaluating rule '{rule_to_evaluate}' returns: '{result.stdout.strip() if result else 'No error message'}'")
-        error_message = 'Invalid Syntax: exit with ' + str(e)
-        return {"is_correct": False, "partial_score": 0.0, "syntax_valid": False, "error": error_message}
     finally:
         if os.path.exists(temp_file):
             os.remove(temp_file)
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
@@ -270,10 +312,6 @@ class VerifiableRewardsForScalableLogicalReasoning(evaluate.Metric):
             if not validation_program:
                 raise ValueError(f"Example {i} does not contain validation program field")
-            # Make sure the prediction is a proper rule format
-            if not prediction.strip().endswith('.'):
-                prediction = prediction.strip() + '.'
             eval_inputs.append((prediction, validation_program, eval_config))
         # Process evaluations in parallel

 """
+def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
     """
     Evaluates a predicted rule against the validation program using Prolog.
     """
     positive_pred = eval_config.get("positive_predicate", "eastbound")
     negative_pred = eval_config.get("negative_predicate", "westbound")
     # extract predicate from rule_to_evaluate
+    rule_to_evaluate = extract_ilp_from_text_v2(prediction)
     if positive_pred not in rule_to_evaluate:
         logger.warning(f"Rule '{rule_to_evaluate}' does not contain positive predicate '{positive_pred}'")
         return {
 % Count successful checks
 check_count(Count) :-
+    (setof(({vars}), ((pos({vars}); neg({vars})), check({vars})), CorrectExamples) ->
+        length(CorrectExamples, Count)
+    ;
+        Count = 0
+    ).
 check_all :- forall((pos({vars});neg({vars})), check({vars})).
     """
             timeout=timeout,
             text=True
         )
+        partial_score = 0.0 if result.stdout.strip() == '' else int(result.stdout.strip())
         # Extract partial score from output
+        partial_score = partial_score / pos_negs if pos_negs > 0 else 0.0
         is_correct = True if partial_score == 1.0 else False
+        error = f'Rule invalid "{rule_to_evaluate}" with' + result.stderr if result.stderr else None
         t1 = time.time()
         return {
         return {"is_correct": False, "partial_score": 0.0, "syntax_valid": False,
                 "error": f"Evaluation timed out after {timeout} seconds"}
     except Exception as e:
+        logger.warning(f"Error evaluating rule '{rule_to_evaluate}' returns: '{result.stdout.strip() if result else 'No error message'}' with error: {e}")
+        return {"is_correct": False, "partial_score": 0.0, "syntax_valid": False,
+                "error": f"Syntactically invalid rule '{rule_to_evaluate}'"}
     finally:
         if os.path.exists(temp_file):
             os.remove(temp_file)
def extract_ilp_from_text(text):
    """Extract Prolog rules (clauses with a body) from free-form text.

    Scans ``text`` for substrings of the shape ``head(args) :- body.`` and
    returns them in order of appearance, each followed by a newline.

    Limitations that follow directly from the pattern used:
      * Facts (clauses without ``:-``) are not extracted.
      * The head's argument list ends at the first ``)``, so nested
        parentheses in the head are not supported.
      * The body ends at the first ``.``, so a body containing a period
        (e.g. a float literal) is truncated there.

    Args:
        text: The text to search (e.g. a model's natural-language answer).
            ``None`` or an empty string yields an empty result.

    Returns:
        A string with one extracted rule per line (each line terminated by
        ``'\n'``), or ``''`` when no rule is found.
    """
    if not text:
        return ''
    # Full rule with implication: identifier, "(args)", ":-", body, period.
    rule_pattern = r'([a-zA-Z_][a-zA-Z0-9_]*\([^)]*\)\s*:-[^.]*\.)'
    # Every match starts with an identifier character and ends with '.',
    # so no stripping or period-appending is needed.
    return ''.join(rule + '\n' for rule in re.findall(rule_pattern, text))
def extract_ilp_from_text_v2(text, target_predicates=None):
    """Extract de-duplicated Prolog rules from free-form text.

    The text is first flattened to a single line (each newline plus any
    following whitespace becomes one space) so that rules spread over
    several lines, e.g. inside a code block, are matched as one clause.
    Rules of the shape ``head(args) :- body.`` are then collected; the body
    ends at the first ``.`` after ``:-``.

    Args:
        text: The text to search (e.g. a model's natural-language answer).
            ``None`` or an empty string yields an empty result.
        target_predicates: Optional iterable of predicate names. When given
            and non-empty, only rules whose head is exactly one of these
            names are extracted; otherwise any identifier is accepted as
            the head.

    Returns:
        The distinct rules joined by ``'\n'`` in order of first appearance,
        without trailing whitespace; ``''`` when no rule is found.
    """
    if not text:
        return ''

    # Crude pre-processing: flatten everything onto one line.
    flat = re.sub(r'\n\s*', ' ', text)

    names = [re.escape(p) for p in (target_predicates or [])]
    if names:
        # The look-behind keeps a target name from matching as the tail of
        # a longer identifier (e.g. "eastbound" inside "not_eastbound").
        head_pat = r'(?<![A-Za-z0-9_])(?:' + '|'.join(names) + r')'
    else:
        head_pat = r'[a-zA-Z_][a-zA-Z0-9_]*'

    rule_pattern = re.compile(head_pat + r'\([^()]*\)\s*:-.*?\.')

    # dict.fromkeys de-duplicates while keeping first-seen order; a set
    # would make the clause order depend on string hash randomization.
    unique_rules = dict.fromkeys(rule_pattern.findall(flat))

    # Every match starts with an identifier character and ends with '.',
    # so the joined result needs no further stripping.
    return '\n'.join(unique_rules)
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
             if not validation_program:
                 raise ValueError(f"Example {i} does not contain validation program field")
             eval_inputs.append((prediction, validation_program, eval_config))
         # Process evaluations in parallel

app.py CHANGED Viewed

@@ -269,5 +269,5 @@ Evaluations performed by the symbolic judge are fully verifiable and grounded in
     return demo
 # Use a local path instead of a module name
-module = evaluate.load("AIML-TUDA/VerifiableRewardsForScalableLogicalReasoning")
 create_interface(module).launch()

     return demo
 # Use a local path instead of a module name
+module = evaluate.load("./VerifiableRewardsForScalableLogicalReasoning.py")
 create_interface(module).launch()