Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +19 -0
ask.py +7 -0
benchmark_encode.py +9 -0
chunk.py +4 -0
chunk_creation.py +128 -0
chunks_stage/wiki_00.json +0 -0
chunks_stage/wiki_01.json +0 -0
chunks_stage/wiki_02.json +0 -0
chunks_stage/wiki_03.json +0 -0
chunks_stage/wiki_04.json +0 -0
chunks_stage/wiki_05.json +0 -0
chunks_stage/wiki_06.json +0 -0
chunks_stage/wiki_07.json +0 -0
chunks_stage/wiki_08.json +0 -0
chunks_stage/wiki_09.json +0 -0
chunks_stage/wiki_10.json +0 -0
chunks_stage/wiki_11.json +0 -0
chunks_stage/wiki_12.json +0 -0
chunks_stage/wiki_13.json +0 -0
chunks_stage/wiki_14.json +0 -0
chunks_stage/wiki_15.json +0 -0
chunks_stage/wiki_16.json +0 -0
chunks_stage/wiki_17.json +0 -0
chunks_stage/wiki_18.json +0 -0
chunks_stage/wiki_19.json +0 -0
chunks_stage/wiki_20.json +0 -0
chunks_stage/wiki_21.json +0 -0
chunks_stage/wiki_22.json +0 -0
chunks_stage/wiki_23.json +0 -0
chunks_stage/wiki_24.json +0 -0
chunks_stage/wiki_25.json +0 -0
chunks_stage/wiki_26.json +0 -0
chunks_stage/wiki_27.json +0 -0
chunks_stage/wiki_28.json +0 -0
chunks_stage/wiki_29.json +0 -0
chunks_stage/wiki_30.json +0 -0
chunks_stage/wiki_31.json +0 -0
chunks_stage/wiki_32.json +0 -0
chunks_stage/wiki_33.json +0 -0
chunks_stage/wiki_34.json +0 -0
chunks_stage/wiki_35.json +0 -0
chunks_stage/wiki_36.json +0 -0
chunks_stage/wiki_37.json +0 -0
chunks_stage/wiki_38.json +0 -0
chunks_stage/wiki_39.json +0 -0
chunks_stage/wiki_40.json +0 -0
chunks_stage/wiki_41.json +0 -0
chunks_stage/wiki_42.json +0 -0
chunks_stage/wiki_43.json +0 -0
chunks_stage/wiki_44.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib/python3.13/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib/python3.13/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib/python3.13/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib/python3.13/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib/python3.13/site-packages/pip/_vendor/idna/__pycache__/uts46data.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib/python3.13/site-packages/pip/_vendor/pkg_resources/__pycache__/__init__.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib/python3.13/site-packages/pip/_vendor/rich/__pycache__/_emoji_codes.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib/python3.13/site-packages/pip/_vendor/rich/__pycache__/console.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib64/python3.13/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib64/python3.13/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib64/python3.13/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib64/python3.13/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib64/python3.13/site-packages/pip/_vendor/idna/__pycache__/uts46data.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib64/python3.13/site-packages/pip/_vendor/pkg_resources/__pycache__/__init__.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib64/python3.13/site-packages/pip/_vendor/rich/__pycache__/_emoji_codes.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+syvaai_env/lib64/python3.13/site-packages/pip/_vendor/rich/__pycache__/console.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+wiki_chunks.json filter=lfs diff=lfs merge=lfs -text
+wiki_faiss.index filter=lfs diff=lfs merge=lfs -text
+wiki_texts.txt filter=lfs diff=lfs merge=lfs -text

ask.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import requests
+resp = requests.post(
+    "http://0.0.0.0:8000/ask",
+    json={"question": "What is capital of France?", "top_k": 5}
+)
+print(resp.json())

benchmark_encode.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from sentence_transformers import SentenceTransformer
+import time, numpy as np
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+texts = ["This is a test."] * 1000   # batch you plan to use
+t0 = time.time()
+emb = model.encode(texts, convert_to_numpy=True)
+t1 = time.time()
+print("seconds for 1000:", t1-t0)
+print("shape:", emb.shape)

chunk.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import faiss
+index = faiss.read_index("wiki_faiss.index")
+print("Total vectors:", index.ntotal)

chunk_creation.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import os, json
+from pathlib import Path
+from tqdm import tqdm
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+DUMP_PATH   = "/home/ubuntu/output"
+FAISS_OUT   = "wiki_faiss.index"
+STATE_FILE  = "progress.json"
+PAUSE_FLAG  = "PAUSE"
+CHUNK_SIZE  = 200
+BATCH_SIZE  = 1000
+CHECKPOINT_BATCHES = 5
+# Load model and FAISS index
+embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+dim = embedder.get_sentence_embedding_dimension()
+if Path(FAISS_OUT).exists():
+    index = faiss.read_index(FAISS_OUT)
+else:
+    index = faiss.IndexFlatIP(dim)
+# Gather all files
+files = [os.path.join(r,f) for r,_,fs in os.walk(DUMP_PATH) for f in fs if f.startswith("wiki_")]
+total_files = len(files)
+# Load progress
+if Path(STATE_FILE).exists():
+    with open(STATE_FILE) as f:
+        state = json.load(f)
+    file_idx = state.get("file_idx", 0)
+    batch_idx = state.get("batch_idx", 0)
+    print(f"▶ Resuming from file {file_idx}, batch {batch_idx}")
+else:
+    file_idx = 0
+    batch_idx = 0
+# Helper: split text into chunks
+def chunk_text(text, size=CHUNK_SIZE):
+    words = text.split()
+    for i in range(0, len(words), size):
+        yield " ".join(words[i:i+size])
+# --- Precompute total chunks and already processed chunks for overall progress bar ---
+file_chunk_counts = []
+total_chunks = 0
+for f in files:
+    cnt = 0
+    try:
+        with open(f, "r", encoding="utf-8") as file:
+            for line in file:
+                data = json.loads(line)
+                text = data.get("text", "").strip()
+                if text:
+                    cnt += len(list(chunk_text(text)))
+    except:
+        pass
+    file_chunk_counts.append(cnt)
+    total_chunks += cnt
+# Already processed chunks
+processed_chunks = sum(file_chunk_counts[:file_idx]) + batch_idx
+# Overall progress bar
+pbar = tqdm(total=total_chunks, initial=processed_chunks, desc="Embedding chunks", unit="chunk")
+# --- Main loop ---
+for f_idx in range(file_idx, total_files):
+    file_path = files[f_idx]
+    # Pause check
+    if Path(PAUSE_FLAG).exists():
+        print("\n⏸ Pause requested. Saving state...")
+        faiss.write_index(index, FAISS_OUT)
+        with open(STATE_FILE, "w") as f:
+            json.dump({"file_idx": f_idx, "batch_idx": batch_idx}, f)
+        exit(0)
+    # Read file
+    chunks = []
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line in f:
+                data = json.loads(line)
+                text = data.get("text", "").strip()
+                if text:
+                    chunks.extend(list(chunk_text(text)))
+    except Exception as e:
+        print(f"Error reading {file_path}: {e}")
+        continue
+    start = batch_idx if f_idx == file_idx else 0
+    total_chunks_in_file = len(chunks)
+    # Process chunks in batches
+    for b_idx in range(start, total_chunks_in_file, BATCH_SIZE):
+        if Path(PAUSE_FLAG).exists():
+            print("\n⏸ Pause requested. Saving state...")
+            faiss.write_index(index, FAISS_OUT)
+            with open(STATE_FILE, "w") as f:
+                json.dump({"file_idx": f_idx, "batch_idx": b_idx}, f)
+            exit(0)
+        batch_texts = chunks[b_idx:b_idx+BATCH_SIZE]
+        embeddings = embedder.encode(batch_texts, convert_to_numpy=True, dtype=np.float32)
+        faiss.normalize_L2(embeddings)
+        index.add(embeddings)
+        # Update overall progress bar
+        pbar.update(len(batch_texts))
+        # Checkpoint
+        if (b_idx // BATCH_SIZE + 1) % CHECKPOINT_BATCHES == 0:
+            faiss.write_index(index, FAISS_OUT)
+            with open(STATE_FILE, "w") as f:
+                json.dump({"file_idx": f_idx, "batch_idx": b_idx + BATCH_SIZE}, f)
+    # Finished file
+    batch_idx = 0
+    faiss.write_index(index, FAISS_OUT)
+    with open(STATE_FILE, "w") as f:
+        json.dump({"file_idx": f_idx+1, "batch_idx": 0}, f)
+pbar.close()
+print("✅ All files processed.")
+if Path(PAUSE_FLAG).exists():
+    os.remove(PAUSE_FLAG)

chunks_stage/wiki_00.json ADDED Viewed