Upload 6 files
Browse files- README.md +6 -7
- app.py +53 -0
- gitattributes +35 -0
- rag_pipeline.py +34 -0
- requirements.txt +14 -0
- utils.py +31 -0
README.md
CHANGED
@@ -1,14 +1,13 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.31.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
-
license: mit
|
11 |
-
short_description: ask and answer
|
12 |
---
|
13 |
|
14 |
-
|
|
|
|
1 |
---
|
2 |
+
title: Arabic RAG Assistant
|
3 |
+
emoji: 📚
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: purple
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.31.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
|
|
|
|
10 |
---
|
11 |
|
12 |
+
# Arabic RAG Assistant
|
13 |
+
مساعد بحث عربي يعتمد على الذكاء الاصطناعي المفتوح المصدر للإجابة على الأسئلة بناءً على مستندات PDF / DOCX / TXT المرفوعة.
|
app.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from rag_pipeline import RAGPipeline
|
3 |
+
from utils import process_documents
|
4 |
+
import time
|
5 |
+
|
6 |
+
# Module-level singleton: constructing RAGPipeline loads the generation and
# embedding models at import time, so app startup is slow but every request
# shares one in-memory index.
rag = RAGPipeline()
|
7 |
+
|
8 |
+
def log_message(msg, logs):
    """Append *msg* plus a trailing newline to the running log string.

    Args:
        msg: the line to record.
        logs: the accumulated log text so far.

    Returns:
        The new log text (strings are immutable, so a fresh value is returned).
    """
    return f"{logs}{msg}\n"
|
11 |
+
|
12 |
+
def upload_and_index(files, logs):
    """Extract text chunks from each uploaded file, build the RAG index,
    and reveal the question controls.

    Args:
        files: list of gradio file objects, or None when nothing was selected.
        logs: accumulated log text shown to the user.

    Returns:
        Tuple of (updated logs, visibility update for the question box,
        visibility update for the ask button).
    """
    # Fix: gradio passes None when the button is clicked with no files
    # selected; the original crashed iterating over None.
    if not files:
        logs = log_message("[RAG] لم يتم اختيار أي ملف.", logs)
        return logs, gr.update(visible=False), gr.update(visible=False)

    logs = log_message("[RAG] بدء معالجة الملفات...", logs)
    all_chunks = []
    for file in files:
        logs = log_message(f"[RAG] معالجة الملف: {file.name}", logs)
        chunks = process_documents(file.name)
        all_chunks.extend(chunks)
        logs = log_message(f"[RAG] تم استخراج {len(chunks)} مقطع من {file.name}", logs)

    # Fix: nothing extracted (unsupported formats or empty documents) —
    # keep the question UI hidden instead of indexing an empty corpus.
    if not all_chunks:
        logs = log_message("[RAG] لم يتم استخراج أي مقاطع من الملفات.", logs)
        return logs, gr.update(visible=False), gr.update(visible=False)

    logs = log_message(f"[RAG] بناء الفهرس لـ {len(all_chunks)} مقطع...", logs)
    start = time.time()
    rag.build_index(all_chunks)
    duration = time.time() - start
    logs = log_message(f"[RAG] تم بناء الفهرس في {duration:.2f} ثانية.", logs)
    return logs, gr.update(visible=True), gr.update(visible=True)
|
27 |
+
|
28 |
+
def answer_question(question, logs):
    """Answer *question* with the global RAG pipeline.

    Args:
        question: the user's question text.
        logs: accumulated log text shown to the user.

    Returns:
        Tuple of (answer text, updated logs).
    """
    # Fix: ignore empty submissions instead of querying the index with "".
    if not question or not question.strip():
        logs = log_message("[RAG] السؤال فارغ، يرجى كتابة سؤال.", logs)
        return "", logs

    logs = log_message(f"[RAG] استلام السؤال: {question}", logs)
    start = time.time()
    answer, sources = rag.answer(question)
    duration = time.time() - start
    logs = log_message(f"[RAG] تم الإجابة في {duration:.2f} ثانية.", logs)
    logs = log_message(f"[RAG] المصادر: {sources}", logs)
    return answer, logs
|
36 |
+
|
37 |
+
# --- Gradio UI -------------------------------------------------------------
# Fix: the original created `logs = gr.State("")` and passed it as the
# handlers' log input, but never listed the State in any `outputs`, so the
# State stayed "" forever and every click discarded the previous log history.
# Wiring the visible `logs_output` textbox as the input instead keeps the log
# accumulating without changing the handler signatures; the State is dropped.
with gr.Blocks() as demo:
    gr.Markdown("# نظام استرجاع المعرفة (RAG)")

    with gr.Row():
        files_input = gr.File(file_types=['.pdf', '.docx', '.txt'], file_count="multiple", label="رفع الملفات")
        upload_btn = gr.Button("رفع وبناء الفهرس")

    # Operations log; read back as handler input so entries accumulate.
    logs_output = gr.Textbox(label="سجل العمليات", lines=12, interactive=False, value="")
    # Hidden until an index has been built successfully.
    question_input = gr.Textbox(label="اكتب سؤالك هنا", visible=False)
    ask_btn = gr.Button("إرسال السؤال", visible=False)
    answer_output = gr.Textbox(label="الإجابة", lines=5)

    upload_btn.click(upload_and_index, inputs=[files_input, logs_output], outputs=[logs_output, question_input, ask_btn])
    ask_btn.click(answer_question, inputs=[question_input, logs_output], outputs=[answer_output, logs_output])

demo.launch()
|
gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
rag_pipeline.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
2 |
+
from sentence_transformers import SentenceTransformer
|
3 |
+
import numpy as np
|
4 |
+
import time
|
5 |
+
|
6 |
+
class RAGPipeline:
    """Minimal retrieval-augmented generation pipeline.

    Embeds text chunks with a SentenceTransformer, retrieves the most
    similar chunks for a question via cosine similarity, and generates an
    answer with a seq2seq model conditioned on question + retrieved context.
    """

    # Number of chunks retrieved as context for each question.
    TOP_K = 5

    def __init__(self):
        print("[RAG] جاري تحميل النموذج والمحول...")
        # NOTE(review): facebook/bart-large is a pretrained (not QA
        # fine-tuned) model and all-MiniLM-L6-v2 is an English-centric
        # embedder — confirm both are adequate for Arabic documents.
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
        self.model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None            # (n_chunks, dim) row-normalized embeddings, or None
        self.chunks = []             # raw text chunks, aligned with index rows
        self.chunk_embeddings = []
        print("[RAG] تم التحميل بنجاح.")

    @staticmethod
    def _normalize(mat):
        """Row-normalize *mat* so dot products become cosine similarities."""
        norms = np.linalg.norm(mat, axis=1, keepdims=True)
        # Clip avoids division by zero for all-zero rows.
        return mat / np.clip(norms, 1e-12, None)

    def build_index(self, chunks, logs=None):
        """Embed *chunks* and store them as the retrieval index.

        Args:
            chunks: iterable of text chunks.
            logs: optional list; a status line is appended when provided.
        """
        self.chunks = list(chunks)
        # Fix: an empty corpus previously produced a malformed index that
        # made answer() crash; leave it unset so answer() reports it.
        if not self.chunks:
            self.index = None
            self.chunk_embeddings = []
            return
        self.chunk_embeddings = self.embedder.encode(self.chunks, convert_to_numpy=True)
        if logs is not None:
            logs.append(f"[RAG] تم بناء الفهرس بأبعاد {self.chunk_embeddings.shape}")
        # Fix: normalize rows so retrieval ranks by cosine similarity rather
        # than raw dot product, which is biased toward high-norm chunks.
        self.index = self._normalize(np.asarray(self.chunk_embeddings, dtype=np.float32))

    def answer(self, question):
        """Return (generated answer, list of source chunks) for *question*.

        Fails gracefully with an explanatory message when no index exists.
        """
        # Fix: the original raised an opaque error when answer() was called
        # before build_index().
        if self.index is None or not self.chunks:
            return "لم يتم بناء الفهرس بعد. يرجى رفع الملفات أولاً.", []
        question_embedding = self.embedder.encode([question], convert_to_numpy=True)
        question_embedding = self._normalize(np.asarray(question_embedding, dtype=np.float32))
        # Cosine similarity against every chunk; keep the best k (clamped to
        # the corpus size — the original always took 5).
        similarities = (self.index @ question_embedding.T).ravel()
        k = min(self.TOP_K, len(self.chunks))
        top_idx = similarities.argsort()[-k:][::-1]
        context = "\n".join(self.chunks[i] for i in top_idx)
        inputs = self.tokenizer.encode(question + " " + context, return_tensors="pt", max_length=512, truncation=True)
        outputs = self.model.generate(inputs, max_length=200)
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        sources = [self.chunks[i] for i in top_idx]
        return answer, sources
|
requirements.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Deduplicated: gradio, transformers, sentence-transformers, PyPDF2 and
# python-docx were each listed twice (pinned and unpinned); the pinned
# entry is kept.
# NOTE(review): README declares sdk_version 5.31.0 but gradio is pinned to
# 4.25.0 here — confirm which version is intended.
gradio==4.25.0
transformers==4.40.1
sentence-transformers
langchain==0.1.20
chromadb==0.4.24
PyPDF2
python-docx
numpy
torch
|
utils.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from PyPDF2 import PdfReader
|
3 |
+
from docx import Document
|
4 |
+
|
5 |
+
def process_pdf(file_path):
    """Extract text from a PDF and return it as a list of paragraph chunks.

    Args:
        file_path: path to the .pdf file.

    Returns:
        List of non-empty text chunks split on blank lines.
    """
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        # Fix: extract_text() returns None for pages without a text layer
        # (e.g. scanned images); '' keeps the concatenation safe.
        text += (page.extract_text() or "") + "\n"
    # Split into paragraphs and drop whitespace-only chunks, consistent
    # with process_docx which also filters empty paragraphs.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
|
11 |
+
|
12 |
+
def process_docx(file_path):
    """Return the non-empty paragraphs of a .docx document as a list of strings."""
    document = Document(file_path)
    chunks = []
    for paragraph in document.paragraphs:
        # Keep only paragraphs containing visible text.
        if paragraph.text.strip():
            chunks.append(paragraph.text)
    return chunks
|
16 |
+
|
17 |
+
def process_txt(file_path):
    """Read a UTF-8 text file and split it into blank-line-separated chunks."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.split('\n\n')
|
21 |
+
|
22 |
+
def process_documents(file_path):
    """Dispatch *file_path* to the extractor matching its extension.

    Args:
        file_path: path whose suffix selects the parser (.pdf/.docx/.txt).

    Returns:
        List of text chunks; unsupported extensions yield an empty list.
    """
    extension = os.path.splitext(file_path)[1].lower()
    # Guard-clause dispatch: first matching extension wins.
    if extension == '.pdf':
        return process_pdf(file_path)
    if extension == '.docx':
        return process_docx(file_path)
    if extension == '.txt':
        return process_txt(file_path)
    return []
|