Nicolai Berk committed on
Commit
f499a63
·
1 Parent(s): 253a65f

Fix corpus ids

Browse files
Files changed (1) hide show
  1. app.py +8 -1
app.py CHANGED
@@ -28,7 +28,14 @@ if hf_token:
28
  # Load corpus
29
  print("Loading dataset...")
30
  dataset = load_dataset("rag-datasets/rag-mini-wikipedia", "text-corpus")
31
- corpus = [item for item in dataset["passages"]]
 
 
 
 
 
 
 
32
 
33
  # Embedding model
34
  print("Encoding corpus...")
 
28
  # Load corpus
29
  print("Loading dataset...")
30
  dataset = load_dataset("rag-datasets/rag-mini-wikipedia", "text-corpus")
31
+ # corpus = [item for item in dataset["passages"]]
32
+
33
+ # Always clean + use this corpus consistently
34
+ corpus = []
35
+ for item in dataset["passages"]:
36
+ text = str(item).strip()
37
+ if text:
38
+ corpus.append(text)
39
 
40
  # Embedding model
41
  print("Encoding corpus...")