Spaces:

taha092
/

HumanizerV2

Runtime error

App Files Files Community

taha092 committed on Jul 22

Commit

31d085e

verified ·

1 Parent(s): fd8896a

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -46

app.py CHANGED Viewed

@@ -9,13 +9,17 @@ import random
 import re
 # ----------------------
-# Paraphrasing Model Setup (Pegasus)
 # ----------------------
-PARAPHRASE_MODEL_NAME = "tuner007/pegasus_paraphrase"
-paraphrase_tokenizer = AutoTokenizer.from_pretrained(PARAPHRASE_MODEL_NAME)
-paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASE_MODEL_NAME)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-paraphrase_model = paraphrase_model.to(device)
 # ----------------------
 # Semantic Similarity Model
@@ -31,7 +35,7 @@ ai_detector = pipeline("text-classification", model=AI_DETECTOR_MODEL, device=0
 # ----------------------
 # Prompt Variations for Humanization
 # ----------------------
-PROMPT_VARIANTS = [
     "Paraphrase this naturally:",
     "Rewrite as if explaining to a friend:",
     "Make this sound like a real conversation:",
@@ -41,6 +45,12 @@ PROMPT_VARIANTS = [
     "Rewrite in a friendly, informal tone:",
     "Paraphrase in a way a student would say it:",
 ]
 # ----------------------
 # Sentence Splitter
@@ -50,7 +60,7 @@ def split_sentences(text):
     return [s for s in sentences if s]
 # ----------------------
-# Light Post-Processing
 # ----------------------
 def postprocess_text(text):
     contractions = {
@@ -64,40 +74,99 @@ def postprocess_text(text):
         "at the end of the day", "to be honest", "as a matter of fact", "for what it's worth",
         "in a nutshell", "the bottom line is", "all things considered"
     ]
     if random.random() < 0.3:
         text += " " + random.choice(idioms) + "."
     return text
 # ----------------------
-# Sentence-level Paraphrasing with Prompt Variation
 # ----------------------
-def paraphrase_sentence(sentence, tone):
-    prompt = random.choice(PROMPT_VARIANTS)
-    if tone != "Stealth":
-        prompt = f"{prompt} ({tone} tone):"
     full_prompt = f"{prompt} {sentence}"
-    batch = paraphrase_tokenizer([full_prompt], truncation=True, padding='longest', max_length=60, return_tensors="pt").to(device)
-    outputs = paraphrase_model.generate(
         **batch,
         max_length=60,
         num_beams=5,
         num_return_sequences=1,
         temperature=1.0
     )
-    tgt_text = paraphrase_tokenizer.batch_decode(outputs, skip_special_tokens=True)
     return tgt_text[0] if tgt_text else sentence
 # ----------------------
-# Main Paraphrasing Function
 # ----------------------
-def paraphrase(text, tone):
     sentences = split_sentences(text)
     paraphrased = []
     for sent in sentences:
-        rewritten = paraphrase_sentence(sent, tone)
-        paraphrased.append(rewritten)
     joined = ' '.join(paraphrased)
-    return postprocess_text(joined)
 # ----------------------
 # Semantic Similarity Function
@@ -108,22 +177,6 @@ def semantic_similarity(text1, text2):
     sim = util.pytorch_cos_sim(emb1, emb2).item()
     return sim
-# ----------------------
-# Local AI Detection Function
-# ----------------------
-def check_ai_score(text):
-    try:
-        result = ai_detector(text)
-        for r in result:
-            # LABEL_1 = AI, LABEL_0 = Human
-            if r['label'] in ['LABEL_1', 'Fake']:
-                return r['score'], None
-            elif r['label'] in ['LABEL_0', 'Real']:
-                return 1.0 - r['score'], None
-        return 0.5, None  # fallback
-    except Exception as e:
-        return None, f"AI detection error: {str(e)}"
 # ----------------------
 # Humanization Score & Rating
 # ----------------------
@@ -149,22 +202,35 @@ def process(text, tone):
     if pre_ai_prob is None:
         return "", f"AI Detection Error: {pre_err}", 0.0, "", 0.0, ""
     try:
-        paraphrased = paraphrase(text, tone)
     except Exception as e:
         return f"[Paraphrasing error: {str(e)}]", "", 0.0, "", 0.0, ""
-    post_ai_prob, post_err = check_ai_score(paraphrased)
-    if post_ai_prob is None:
-        return paraphrased, f"AI Detection Error: {post_err}", 0.0, "", 0.0, ""
-    sim = semantic_similarity(text, paraphrased)
-    score = humanization_score(sim, post_ai_prob)
-    rating = humanization_rating(score)
-    ai_score_str = f"Pre: {100*(1-pre_ai_prob):.1f}% human | Post: {100*(1-post_ai_prob):.1f}% human"
     return (
-        paraphrased,
         ai_score_str,
         sim,
         rating,
-        score * 100,
         ""
     )

 import re
 # ----------------------
+# Paraphrasing Model Setup (Pegasus + T5)
 # ----------------------
+PEGASUS_MODEL_NAME = "tuner007/pegasus_paraphrase"
+T5_MODEL_NAME = "Vamsi/T5_Paraphrase_Paws"
+pegasus_tokenizer = AutoTokenizer.from_pretrained(PEGASUS_MODEL_NAME)
+pegasus_model = AutoModelForSeq2SeqLM.from_pretrained(PEGASUS_MODEL_NAME)
+t5_tokenizer = AutoTokenizer.from_pretrained(T5_MODEL_NAME)
+t5_model = AutoModelForSeq2SeqLM.from_pretrained(T5_MODEL_NAME)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+pegasus_model = pegasus_model.to(device)
+t5_model = t5_model.to(device)
 # ----------------------
 # Semantic Similarity Model
 # ----------------------
 # Prompt Variations for Humanization
 # ----------------------
+PEGASUS_PROMPTS = [
     "Paraphrase this naturally:",
     "Rewrite as if explaining to a friend:",
     "Make this sound like a real conversation:",
     "Rewrite in a friendly, informal tone:",
     "Paraphrase in a way a student would say it:",
 ]
+T5_PROMPTS = [
+    "Paraphrase the following text in a formal, academic tone:",
+    "Paraphrase the following text in a casual, conversational tone:",
+    "Paraphrase the following text in a friendly, approachable tone:",
+    "Paraphrase the following text to bypass AI detectors and sound as human as possible:",
+]
 # ----------------------
 # Sentence Splitter
     return [s for s in sentences if s]
 # ----------------------
+# Aggressive Post-Processing
 # ----------------------
 def postprocess_text(text):
     contractions = {
         "at the end of the day", "to be honest", "as a matter of fact", "for what it's worth",
         "in a nutshell", "the bottom line is", "all things considered"
     ]
+    transitions = [
+        "Interestingly,", "In fact,", "To be clear,", "As a result,", "For example,", "On the other hand,", "In other words,"
+    ]
     if random.random() < 0.3:
         text += " " + random.choice(idioms) + "."
+    if random.random() < 0.3:
+        text = random.choice(transitions) + " " + text
+    # Randomly lower-case a word to mimic human error
+    if random.random() < 0.2:
+        words = text.split()
+        if len(words) > 3:
+            idx = random.randint(1, len(words)-2)
+            words[idx] = words[idx].lower()
+            text = ' '.join(words)
     return text
 # ----------------------
+# Multi-Model, Multi-Pass Paraphrasing
 # ----------------------
def pegasus_paraphrase(sentence):
    """Paraphrase a single sentence with the Pegasus model.

    A randomly chosen entry of ``PEGASUS_PROMPTS`` is prepended to the
    sentence; the combined text is tokenized (truncated to 60 tokens) and
    decoded with 5-beam search.

    Args:
        sentence: The sentence to rewrite.

    Returns:
        The decoded paraphrase, or ``sentence`` unchanged when the model
        produced no usable (non-blank) text.
    """
    prompt = random.choice(PEGASUS_PROMPTS)
    full_prompt = f"{prompt} {sentence}"
    batch = pegasus_tokenizer(
        [full_prompt],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    ).to(device)
    outputs = pegasus_model.generate(
        **batch,
        max_length=60,
        num_beams=5,
        num_return_sequences=1,
        temperature=1.0
    )
    tgt_text = pegasus_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # The decoded list has one entry per generated sequence, so checking the
    # list alone never triggers the fallback; check the decoded text itself so
    # a blank generation does not silently drop the sentence.
    if tgt_text and tgt_text[0].strip():
        return tgt_text[0]
    return sentence
def t5_paraphrase(sentence):
    """Paraphrase a single sentence with the T5 model using sampling.

    A randomly chosen entry of ``T5_PROMPTS`` is prepended to the sentence,
    the text is encoded (truncated to 256 tokens) and one sequence is sampled
    with top-k / top-p filtering.

    Args:
        sentence: The sentence to rewrite.

    Returns:
        The decoded paraphrase, or ``sentence`` unchanged when the model
        produced only blank text (mirrors ``pegasus_paraphrase``'s fallback).
    """
    # NOTE(review): the T5_Paraphrase_Paws model card shows inputs prefixed
    # with "paraphrase: "; these free-form instructions may be treated as text
    # to paraphrase rather than as instructions -- confirm against the card.
    prompt = random.choice(T5_PROMPTS) + " " + sentence
    input_ids = t5_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=256,
        truncation=True,
    ).to(device)
    outputs = t5_model.generate(
        input_ids,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        temperature=0.7,
        repetition_penalty=1.2,
        max_length=256,
        num_return_sequences=1
    )
    paraphrased = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep the input sentence rather than emitting an empty string, so a
    # failed generation cannot delete content from the final text.
    if not paraphrased.strip():
        return sentence
    return paraphrased
 # ----------------------
+# Feedback Loop with AI Detector
 # ----------------------
def check_ai_score(text):
    """Score *text* with ``ai_detector`` and return an AI probability.

    The first prediction carrying a recognised label decides the result:
    ``LABEL_1`` / ``Fake`` are read as "AI" and their score is returned
    directly; ``LABEL_0`` / ``Real`` are read as "human" and the score is
    inverted.

    Args:
        text: The text to classify.

    Returns:
        A ``(probability, error)`` pair. On success ``error`` is ``None`` and
        ``probability`` is the AI likelihood (``0.5`` when no prediction has
        a recognised label). If the detector raises, ``probability`` is
        ``None`` and ``error`` holds a description.
    """
    ai_labels = ('LABEL_1', 'Fake')
    human_labels = ('LABEL_0', 'Real')
    try:
        for prediction in ai_detector(text):
            label = prediction['label']
            if label in ai_labels:
                return prediction['score'], None
            if label in human_labels:
                return 1.0 - prediction['score'], None
    except Exception as exc:
        return None, f"AI detection error: {str(exc)}"
    # No prediction carried a known label: report an undecided score.
    return 0.5, None
+# ----------------------
+# Main Humanizer Pipeline
+# ----------------------
def _rewrite_once(text):
    """Paraphrase every sentence of *text* (Pegasus, then T5) and post-process the joined result."""
    rewritten = [
        t5_paraphrase(pegasus_paraphrase(sent))
        for sent in split_sentences(text)
    ]
    return postprocess_text(' '.join(rewritten))


def humanize_pipeline(text, tone, max_feedback_loops=2):
    """Rewrite *text* and keep rewriting while the AI detector still flags it.

    The text is split into sentences, each sentence is passed through Pegasus
    and then T5, and the joined result is post-processed. The result is then
    scored with ``check_ai_score``; while the score is 0.5 or higher the
    whole text (not just individual sentences) is rewritten again, up to
    ``max_feedback_loops`` times.

    Args:
        text: The input text to humanize.
        tone: Accepted for interface compatibility; not used by this
            function.
        max_feedback_loops: Maximum number of detector checks, each of which
            may trigger one additional full rewrite.

    Returns:
        The final rewritten text.
    """
    processed = _rewrite_once(text)
    for _attempt in range(max_feedback_loops):
        ai_prob, _error = check_ai_score(processed)
        # Stop when the text is considered human (< 0.5) or when the detector
        # failed: without a score there is no feedback to act on, and each
        # extra pass through two generative models only drifts further from
        # the original text.
        if ai_prob is None or ai_prob < 0.5:
            break
        processed = _rewrite_once(processed)
    return processed
 # ----------------------
 # Semantic Similarity Function
     sim = util.pytorch_cos_sim(emb1, emb2).item()
     return sim
 # ----------------------
 # Humanization Score & Rating
 # ----------------------
     if pre_ai_prob is None:
         return "", f"AI Detection Error: {pre_err}", 0.0, "", 0.0, ""
     try:
+        # Generate 3 candidate versions; the one with the lowest AI probability is kept below
+        outputs = [humanize_pipeline(text, tone) for _ in range(3)]
     except Exception as e:
         return f"[Paraphrasing error: {str(e)}]", "", 0.0, "", 0.0, ""
+    # Pick the most human-like version (lowest ai_prob)
+    best = None
+    best_score = -1
+    best_ai_prob = 1.0
+    for out in outputs:
+        post_ai_prob, _ = check_ai_score(out)
+        sim = semantic_similarity(text, out)
+        score = humanization_score(sim, post_ai_prob if post_ai_prob is not None else 1.0)
+        if post_ai_prob is not None and post_ai_prob < best_ai_prob:
+            best = out
+            best_score = score
+            best_ai_prob = post_ai_prob
+    if best is None:
+        best = outputs[0]
+        best_score = 0.0
+        best_ai_prob = 1.0
+    sim = semantic_similarity(text, best)
+    rating = humanization_rating(best_score)
+    ai_score_str = f"Pre: {100*(1-pre_ai_prob):.1f}% human | Post: {100*(1-best_ai_prob):.1f}% human"
     return (
+        best,
         ai_score_str,
         sim,
         rating,
+        best_score * 100,
         ""
     )