# NOTE(review): this region contained GitHub blame-view copy/paste residue
# (a "File size" header, commit hashes, and a column of line numbers 1-61)
# that made the module invalid Python. Replaced with this comment.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, models
import numpy as np
import torch

class RAGPipeline:
    """Minimal retrieval-augmented generation pipeline for Arabic text.

    Embeds text chunks with an Arabic BERT sentence encoder, retrieves the
    chunks most similar to a question by cosine similarity, and uses a
    multilingual mT5 XLSum seq2seq model for summarization / answering.
    """

    def __init__(self):
        print("[RAG] تحميل النماذج...")

        # Sentence embedder: Arabic BERT backbone + mean pooling head.
        word_embedding_model = models.Transformer('asafaya/bert-base-arabic')
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
        self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

        # Multilingual summarization model (XLSum covers Arabic).
        self.tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
        self.model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

        self.chunks = []        # indexed text passages (list of str)
        self.embeddings = None  # np.ndarray of shape (n_chunks, dim), or None before build_index

        print("[RAG] تم تحميل النماذج بنجاح.")

    def build_index(self, chunks):
        """Embed *chunks* (list of str) and store them for later retrieval."""
        self.chunks = chunks
        self.embeddings = self.embedder.encode(chunks, convert_to_numpy=True)

    def retrieve_passages(self, question, top_k=5):
        """Return up to *top_k* indexed chunks most similar to *question*.

        Returns an empty list when no index has been built.

        Fixes over the previous version:
        - ranks by cosine similarity instead of a raw dot product, so the
          ordering is not biased toward chunks with large embedding norms;
        - uses ravel() instead of squeeze(), which crashed argsort() with a
          0-d array when the index contained exactly one chunk;
        - clamps top_k to the corpus size.
        """
        if self.embeddings is None or len(self.chunks) == 0:
            return []
        question_embedding = self.embedder.encode([question], convert_to_numpy=True)
        # Normalize both sides; the epsilon guards against zero-norm vectors.
        doc_norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
        docs = self.embeddings / np.maximum(doc_norms, 1e-12)
        query = question_embedding / max(np.linalg.norm(question_embedding), 1e-12)
        similarities = np.dot(docs, query.T).ravel()  # always 1-D, even for one chunk
        top_k = min(top_k, len(self.chunks))
        top_indices = similarities.argsort()[-top_k:][::-1]
        return [self.chunks[i] for i in top_indices]

    def summarize_text(self, text):
        """Summarize *text* with the mT5 XLSum model; returns '' on failure."""
        print("[RAG][INPUT TO SUMMARIZE]:", text)
        prompt = f"summarize: {text}"
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            # Pass the full tokenizer output (input_ids AND attention_mask)
            # so generate() does not silently ignore the attention mask.
            summary_ids = self.model.generate(**inputs, max_length=128)
            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()
            print(f"[RAG][DEBUG] الملخص الناتج:\n{summary}")
            return summary
        except Exception as e:
            # Best-effort boundary: log and degrade to an empty summary.
            print(f"[RAG][ERROR] أثناء التلخيص: {e}")
            return ""

    def generate_answer_from_passages(self, question, passages_text):
        """Summarize *passages_text*, then generate an answer to *question*.

        Returns a (answer, summary) tuple; either element may be '' when
        the corresponding generation step failed.
        """
        summary = self.summarize_text(passages_text)

        prompt = f"أجب عن السؤال التالي بناء على النص:\n\n{summary}\n\nالسؤال: {question}\nالإجابة:"
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            # Same fix as summarize_text: forward the attention mask too.
            output_ids = self.model.generate(**inputs, max_length=200)
            answer = self.tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
        except Exception as e:
            print(f"[RAG][ERROR] أثناء توليد الإجابة: {e}")
            answer = ""

        return answer, summary