English
danielgrims committed on
Commit
1026698
·
verified ·
1 Parent(s): 6887103

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +19 -0
  2. ask.py +7 -0
  3. benchmark_encode.py +9 -0
  4. chunk.py +4 -0
  5. chunk_creation.py +128 -0
  6. chunks_stage/wiki_00.json +0 -0
  7. chunks_stage/wiki_01.json +0 -0
  8. chunks_stage/wiki_02.json +0 -0
  9. chunks_stage/wiki_03.json +0 -0
  10. chunks_stage/wiki_04.json +0 -0
  11. chunks_stage/wiki_05.json +0 -0
  12. chunks_stage/wiki_06.json +0 -0
  13. chunks_stage/wiki_07.json +0 -0
  14. chunks_stage/wiki_08.json +0 -0
  15. chunks_stage/wiki_09.json +0 -0
  16. chunks_stage/wiki_10.json +0 -0
  17. chunks_stage/wiki_11.json +0 -0
  18. chunks_stage/wiki_12.json +0 -0
  19. chunks_stage/wiki_13.json +0 -0
  20. chunks_stage/wiki_14.json +0 -0
  21. chunks_stage/wiki_15.json +0 -0
  22. chunks_stage/wiki_16.json +0 -0
  23. chunks_stage/wiki_17.json +0 -0
  24. chunks_stage/wiki_18.json +0 -0
  25. chunks_stage/wiki_19.json +0 -0
  26. chunks_stage/wiki_20.json +0 -0
  27. chunks_stage/wiki_21.json +0 -0
  28. chunks_stage/wiki_22.json +0 -0
  29. chunks_stage/wiki_23.json +0 -0
  30. chunks_stage/wiki_24.json +0 -0
  31. chunks_stage/wiki_25.json +0 -0
  32. chunks_stage/wiki_26.json +0 -0
  33. chunks_stage/wiki_27.json +0 -0
  34. chunks_stage/wiki_28.json +0 -0
  35. chunks_stage/wiki_29.json +0 -0
  36. chunks_stage/wiki_30.json +0 -0
  37. chunks_stage/wiki_31.json +0 -0
  38. chunks_stage/wiki_32.json +0 -0
  39. chunks_stage/wiki_33.json +0 -0
  40. chunks_stage/wiki_34.json +0 -0
  41. chunks_stage/wiki_35.json +0 -0
  42. chunks_stage/wiki_36.json +0 -0
  43. chunks_stage/wiki_37.json +0 -0
  44. chunks_stage/wiki_38.json +0 -0
  45. chunks_stage/wiki_39.json +0 -0
  46. chunks_stage/wiki_40.json +0 -0
  47. chunks_stage/wiki_41.json +0 -0
  48. chunks_stage/wiki_42.json +0 -0
  49. chunks_stage/wiki_43.json +0 -0
  50. chunks_stage/wiki_44.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ syvaai_env/lib/python3.13/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
37
+ syvaai_env/lib/python3.13/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
38
+ syvaai_env/lib/python3.13/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
39
+ syvaai_env/lib/python3.13/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
40
+ syvaai_env/lib/python3.13/site-packages/pip/_vendor/idna/__pycache__/uts46data.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
41
+ syvaai_env/lib/python3.13/site-packages/pip/_vendor/pkg_resources/__pycache__/__init__.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
42
+ syvaai_env/lib/python3.13/site-packages/pip/_vendor/rich/__pycache__/_emoji_codes.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
43
+ syvaai_env/lib/python3.13/site-packages/pip/_vendor/rich/__pycache__/console.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
44
+ syvaai_env/lib64/python3.13/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
45
+ syvaai_env/lib64/python3.13/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
46
+ syvaai_env/lib64/python3.13/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
47
+ syvaai_env/lib64/python3.13/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
48
+ syvaai_env/lib64/python3.13/site-packages/pip/_vendor/idna/__pycache__/uts46data.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
49
+ syvaai_env/lib64/python3.13/site-packages/pip/_vendor/pkg_resources/__pycache__/__init__.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
50
+ syvaai_env/lib64/python3.13/site-packages/pip/_vendor/rich/__pycache__/_emoji_codes.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
51
+ syvaai_env/lib64/python3.13/site-packages/pip/_vendor/rich/__pycache__/console.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
52
+ wiki_chunks.json filter=lfs diff=lfs merge=lfs -text
53
+ wiki_faiss.index filter=lfs diff=lfs merge=lfs -text
54
+ wiki_texts.txt filter=lfs diff=lfs merge=lfs -text
ask.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
"""Smoke-test the local /ask endpoint with a sample question."""
import requests

payload = {"question": "What is capital of France?", "top_k": 5}
response = requests.post("http://0.0.0.0:8000/ask", json=payload)
print(response.json())
benchmark_encode.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
"""Micro-benchmark: wall-clock time to encode 1000 sentences with MiniLM."""
from sentence_transformers import SentenceTransformer
import time
import numpy as np

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
batch = ["This is a test."] * 1000  # batch you plan to use
start = time.time()
vectors = model.encode(batch, convert_to_numpy=True)
elapsed = time.time() - start
print("seconds for 1000:", elapsed)
print("shape:", vectors.shape)
chunk.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
"""Report how many vectors are stored in the saved FAISS index."""
import faiss

idx = faiss.read_index("wiki_faiss.index")
print("Total vectors:", idx.ntotal)
chunk_creation.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Embed a wiki dump into a FAISS index, with pause/resume and checkpointing.

Walks DUMP_PATH for extracted wiki_* JSONL files, splits each article's text
into fixed-size word chunks, encodes them with MiniLM, and appends the
L2-normalized vectors to an inner-product FAISS index (IP on unit vectors ==
cosine similarity). Progress is checkpointed to STATE_FILE so the job can be
paused (touch the PAUSE file) and resumed where it left off.
"""
import json
import os
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

DUMP_PATH = "/home/ubuntu/output"   # root of the extracted dump (wiki_* JSONL files)
FAISS_OUT = "wiki_faiss.index"      # FAISS index checkpoint path
STATE_FILE = "progress.json"        # resume state: {"file_idx": int, "batch_idx": int}
PAUSE_FLAG = "PAUSE"                # touch this file to request a graceful pause
CHUNK_SIZE = 200                    # words per chunk
BATCH_SIZE = 1000                   # chunks encoded per model call
CHECKPOINT_BATCHES = 5              # checkpoint every N batches

# Load model and (possibly pre-existing) FAISS index.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
dim = embedder.get_sentence_embedding_dimension()
if Path(FAISS_OUT).exists():
    index = faiss.read_index(FAISS_OUT)
else:
    index = faiss.IndexFlatIP(dim)

# Gather all dump files.
files = [os.path.join(r, f)
         for r, _, fs in os.walk(DUMP_PATH)
         for f in fs if f.startswith("wiki_")]
total_files = len(files)

# Load resume state, if any. batch_idx is a chunk offset within the file.
if Path(STATE_FILE).exists():
    with open(STATE_FILE) as fh:
        state = json.load(fh)
    file_idx = state.get("file_idx", 0)
    batch_idx = state.get("batch_idx", 0)
    print(f"▶ Resuming from file {file_idx}, batch {batch_idx}")
else:
    file_idx = 0
    batch_idx = 0


def chunk_text(text, size=CHUNK_SIZE):
    """Yield consecutive chunks of *size* words from *text*."""
    words = text.split()
    for i in range(0, len(words), size):
        yield " ".join(words[i:i + size])


def iter_chunks(path):
    """Yield all text chunks from one JSONL dump file.

    Propagates OSError / json.JSONDecodeError for unreadable or malformed
    files; callers decide how to handle them.
    """
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            data = json.loads(line)
            text = data.get("text", "").strip()
            if text:
                yield from chunk_text(text)


def save_state(f_idx, b_idx):
    """Persist the FAISS index plus resume pointers."""
    faiss.write_index(index, FAISS_OUT)
    with open(STATE_FILE, "w") as fh:
        json.dump({"file_idx": f_idx, "batch_idx": b_idx}, fh)


# --- Precompute per-file chunk counts for the overall progress bar ---
file_chunk_counts = []
total_chunks = 0
for path in files:
    try:
        # BUG FIX: was a bare `except:` which also swallowed KeyboardInterrupt
        # and SystemExit; catch only the expected read/parse failures.
        cnt = sum(1 for _ in iter_chunks(path))
    except (OSError, json.JSONDecodeError):
        cnt = 0  # unreadable/corrupt files are skipped in the main loop too
    file_chunk_counts.append(cnt)
    total_chunks += cnt

# Chunks already embedded in earlier runs.
processed_chunks = sum(file_chunk_counts[:file_idx]) + batch_idx

pbar = tqdm(total=total_chunks, initial=processed_chunks,
            desc="Embedding chunks", unit="chunk")

# --- Main loop ---
for f_idx in range(file_idx, total_files):
    file_path = files[f_idx]

    # Pause check between files.
    if Path(PAUSE_FLAG).exists():
        print("\n⏸ Pause requested. Saving state...")
        save_state(f_idx, batch_idx)
        exit(0)

    try:
        chunks = list(iter_chunks(file_path))
    except (OSError, json.JSONDecodeError) as e:
        print(f"Error reading {file_path}: {e}")
        continue

    # Resume mid-file only for the first file after a restart.
    start = batch_idx if f_idx == file_idx else 0

    for b_idx in range(start, len(chunks), BATCH_SIZE):
        # Pause check between batches.
        if Path(PAUSE_FLAG).exists():
            print("\n⏸ Pause requested. Saving state...")
            save_state(f_idx, b_idx)
            exit(0)

        batch_texts = chunks[b_idx:b_idx + BATCH_SIZE]
        # BUG FIX: SentenceTransformer.encode() has no `dtype` keyword — the
        # original call raised TypeError. Cast explicitly instead; FAISS
        # requires contiguous float32 input anyway.
        embeddings = embedder.encode(batch_texts, convert_to_numpy=True)
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        faiss.normalize_L2(embeddings)  # unit vectors so IP == cosine
        index.add(embeddings)

        pbar.update(len(batch_texts))

        # Periodic checkpoint: a crash loses at most CHECKPOINT_BATCHES batches.
        if (b_idx // BATCH_SIZE + 1) % CHECKPOINT_BATCHES == 0:
            save_state(f_idx, b_idx + BATCH_SIZE)

    # File finished: advance to the next file and reset the chunk offset.
    batch_idx = 0
    save_state(f_idx + 1, 0)

pbar.close()
print("✅ All files processed.")
if Path(PAUSE_FLAG).exists():
    os.remove(PAUSE_FLAG)
chunks_stage/wiki_00.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_01.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_02.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_03.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_04.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_05.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_06.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_07.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_08.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_09.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_10.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_11.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_12.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_13.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_14.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_15.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_16.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_17.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_18.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_19.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_20.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_21.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_22.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_23.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_24.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_25.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_26.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_27.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_28.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_29.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_30.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_31.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_32.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_33.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_34.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_35.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_36.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_37.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_38.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_39.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_40.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_41.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_42.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_43.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks_stage/wiki_44.json ADDED
The diff for this file is too large to render. See raw diff