ramy2018 commited on
Commit
fcd494c
·
verified ·
1 Parent(s): 0ccaff3

Upload 6 files

Browse files
Files changed (6) hide show
  1. README.md +6 -7
  2. app.py +53 -0
  3. gitattributes +35 -0
  4. rag_pipeline.py +34 -0
  5. requirements.txt +14 -0
  6. utils.py +31 -0
README.md CHANGED
@@ -1,14 +1,13 @@
1
  ---
2
- title: Pope30
3
- emoji: 🌖
4
- colorFrom: gray
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.31.0
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
- short_description: ask and answer
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
+ title: Arabic RAG Assistant
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.31.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
+ # Arabic RAG Assistant
13
+ مساعد بحث عربي يعتمد على الذكاء الاصطناعي المفتوح المصدر للإجابة على الأسئلة بناءً على مستندات PDF / DOCX / TXT المرفوعة.
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from rag_pipeline import RAGPipeline
3
+ from utils import process_documents
4
+ import time
5
+
6
# Module-level singleton pipeline; model downloads/loading happen at import
# time, so the first start of the app can take a while.
rag = RAGPipeline()
7
+
8
def log_message(msg, logs):
    """Append *msg* plus a trailing newline to the accumulated log string.

    Args:
        msg: the new log line (without newline).
        logs: the log text accumulated so far.

    Returns:
        The combined log text.
    """
    return f"{logs}{msg}\n"
11
+
12
def upload_and_index(files, logs):
    """Extract text chunks from the uploaded files, build the retrieval index,
    and reveal the question widgets.

    Args:
        files: list of gradio file objects (each exposes a ``.name`` path),
            or None when the user clicked without uploading anything.
        logs: accumulated log string (value of the ``logs`` gr.State).

    Returns:
        Tuple of (updated log text, visibility update for the question box,
        visibility update for the ask button).
    """
    # Guard: gr.File passes None when no files were selected; the original
    # code raised TypeError when iterating it.
    if not files:
        logs = log_message("[RAG] لم يتم رفع أي ملفات.", logs)
        return logs, gr.update(visible=False), gr.update(visible=False)

    logs = log_message("[RAG] بدء معالجة الملفات...", logs)
    all_chunks = []
    for file in files:
        logs = log_message(f"[RAG] معالجة الملف: {file.name}", logs)
        chunks = process_documents(file.name)
        all_chunks.extend(chunks)
        logs = log_message(f"[RAG] تم استخراج {len(chunks)} مقطع من {file.name}", logs)

    logs = log_message(f"[RAG] بناء الفهرس لـ {len(all_chunks)} مقطع...", logs)
    start = time.time()
    rag.build_index(all_chunks)
    duration = time.time() - start
    logs = log_message(f"[RAG] تم بناء الفهرس في {duration:.2f} ثانية.", logs)
    return logs, gr.update(visible=True), gr.update(visible=True)
27
+
28
def answer_question(question, logs):
    """Run the RAG pipeline on *question* and log timing plus sources.

    Args:
        question: the user's question text.
        logs: accumulated log string (value of the ``logs`` gr.State).

    Returns:
        Tuple of (generated answer, updated log text).
    """
    logs = log_message(f"[RAG] استلام السؤال: {question}", logs)
    started_at = time.time()
    answer, sources = rag.answer(question)
    elapsed = time.time() - started_at
    logs = log_message(f"[RAG] تم الإجابة في {elapsed:.2f} ثانية.", logs)
    logs = log_message(f"[RAG] المصادر: {sources}", logs)
    return answer, logs
36
+
37
# Gradio UI: file upload + index build on the left, then a log box and a
# question/answer flow that is revealed once an index exists.
with gr.Blocks() as demo:
    # Per-session accumulated log text, threaded through the callbacks.
    logs = gr.State("")
    gr.Markdown("# نظام استرجاع المعرفة (RAG)")

    with gr.Row():
        files_input = gr.File(file_types=['.pdf', '.docx', '.txt'], file_count="multiple", label="رفع الملفات")
        upload_btn = gr.Button("رفع وبناء الفهرس")

    logs_output = gr.Textbox(label="سجل العمليات", lines=12, interactive=False, value="")
    # Hidden until upload_and_index returns gr.update(visible=True) for them.
    question_input = gr.Textbox(label="اكتب سؤالك هنا", visible=False)
    ask_btn = gr.Button("إرسال السؤال", visible=False)
    answer_output = gr.Textbox(label="الإجابة", lines=5)

    # NOTE(review): both callbacks read the `logs` State as an input but only
    # write to `logs_output`; the State itself is never in an outputs list, so
    # every callback invocation sees the initial "" again. Confirm whether the
    # State should also be updated, or dropped in favor of `logs_output`.
    upload_btn.click(upload_and_index, inputs=[files_input, logs], outputs=[logs_output, question_input, ask_btn])
    ask_btn.click(answer_question, inputs=[question_input, logs], outputs=[answer_output, logs_output])

demo.launch()
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
rag_pipeline.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
+ from sentence_transformers import SentenceTransformer
3
+ import numpy as np
4
+ import time
5
+
6
class RAGPipeline:
    """Minimal retrieval-augmented generation pipeline.

    Chunks are embedded with a SentenceTransformer model; a question retrieves
    the most similar chunks (cosine similarity) and a seq2seq model generates
    an answer from question + retrieved context.
    """

    # Number of chunks retrieved as context for each question.
    TOP_K = 5

    def __init__(self):
        print("[RAG] جاري تحميل النموذج والمحول...")
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
        self.model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None           # (n_chunks, dim) L2-normalized embedding matrix
        self.chunks = []            # raw text chunks, aligned with index rows
        self.chunk_embeddings = []  # raw (un-normalized) chunk embeddings
        print("[RAG] تم التحميل بنجاح.")

    def build_index(self, chunks, logs=None):
        """Embed *chunks* and store an L2-normalized matrix for cosine search.

        Args:
            chunks: list of text passages.
            logs: optional list; a status line is appended to it when provided.
        """
        self.chunks = list(chunks)
        self.chunk_embeddings = self.embedder.encode(self.chunks, convert_to_numpy=True)
        if logs is not None:
            logs.append(f"[RAG] تم بناء الفهرس بأبعاد {self.chunk_embeddings.shape}")
        # Normalize rows so a plain dot product equals cosine similarity;
        # raw dot products are biased toward higher-norm embeddings.
        emb = np.asarray(self.chunk_embeddings, dtype=np.float32)
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        self.index = emb / np.maximum(norms, 1e-12)

    def answer(self, question):
        """Answer *question* from the indexed chunks.

        Returns:
            Tuple of (answer text, list of source chunks). If no index has
            been built yet, returns an explanatory message and an empty list
            instead of crashing (the original raised TypeError on `self.index`).
        """
        if self.index is None or not self.chunks:
            return "لم يتم بناء الفهرس بعد. يرجى رفع الملفات أولاً.", []
        q = self.embedder.encode([question], convert_to_numpy=True)
        q = np.asarray(q, dtype=np.float32)
        q = q / np.maximum(np.linalg.norm(q, axis=1, keepdims=True), 1e-12)
        # ravel() (not squeeze()) keeps a 1-D array even for a single chunk,
        # where squeeze() produced a 0-d scalar and argsort failed.
        similarities = (self.index @ q.T).ravel()
        k = min(self.TOP_K, len(self.chunks))
        top_idx = np.argsort(similarities)[-k:][::-1]
        context = "\n".join(self.chunks[i] for i in top_idx)
        inputs = self.tokenizer.encode(question + " " + context, return_tensors="pt", max_length=512, truncation=True)
        outputs = self.model.generate(inputs, max_length=200)
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        sources = [self.chunks[i] for i in top_idx]
        return answer, sources
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Deduplicated: gradio, transformers, sentence-transformers, PyPDF2 and
# python-docx were each listed twice (pip rejects double requirements).
# gradio pinned to match README sdk_version: 5.31.0.
gradio==5.31.0
transformers==4.40.1
sentence-transformers
langchain==0.1.20
chromadb==0.4.24
PyPDF2
python-docx
numpy
torch
utils.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from PyPDF2 import PdfReader
3
+ from docx import Document
4
+
5
def process_pdf(file_path):
    """Extract text from a PDF and return its non-empty paragraph chunks.

    Args:
        file_path: path to a .pdf file.

    Returns:
        List of text chunks split on blank lines; whitespace-only chunks
        are dropped so they do not pollute the embedding index downstream.
    """
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        # PyPDF2's extract_text() may return None (e.g. scanned/image-only
        # pages); the original unguarded concat raised TypeError there.
        text += (page.extract_text() or "") + "\n"
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
11
+
12
def process_docx(file_path):
    """Return the non-empty paragraphs of a .docx document.

    Args:
        file_path: path to a .docx file.

    Returns:
        List of paragraph texts, skipping whitespace-only paragraphs.
    """
    document = Document(file_path)
    result = []
    for paragraph in document.paragraphs:
        if paragraph.text.strip():
            result.append(paragraph.text)
    return result
16
+
17
def process_txt(file_path):
    """Read a UTF-8 text file and split it into paragraph chunks.

    Args:
        file_path: path to a .txt file.

    Returns:
        List of chunks produced by splitting the content on blank lines.
    """
    with open(file_path, encoding='utf-8') as handle:
        content = handle.read()
    return content.split('\n\n')
21
+
22
def process_documents(file_path):
    """Route *file_path* to the extractor matching its file extension.

    Args:
        file_path: path whose extension decides the parser (.pdf/.docx/.txt).

    Returns:
        List of text chunks, or an empty list for unknown extensions.
    """
    extension = os.path.splitext(file_path)[1].lower()
    if extension == '.pdf':
        return process_pdf(file_path)
    if extension == '.docx':
        return process_docx(file_path)
    if extension == '.txt':
        return process_txt(file_path)
    return []