nebulatech-in committed
Commit bff8572 · verified
Parent: ef93e02

Upload 5 files

Files changed (5)
  1. .gitignore +7 -0
  2. EXAMPLE.json +3 -0
  3. README.md +18 -0
  4. rag_seo.py +148 -0
  5. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1,7 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.DS_Store
+ .env
+ output.jsonl
+ *_copycite.md
EXAMPLE.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "example_usage": "python rag_seo.py --url https://www.nebulatech.in/answers/ai-seo/key-components-of-ai-seo --entity 'AI SEO' --sector 'b2b' --out nebula_ai_seo.jsonl"
+ }
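
For reference, a fuller invocation that also exercises the optional Qdrant flags defined in `rag_seo.py` might look like the sketch below. The flags are real (they appear in the script's argparse setup), but the endpoint and API-key values are illustrative placeholders, not part of the committed example.

```bash
# Hypothetical invocation; the endpoint, key, and collection values are placeholders.
python rag_seo.py \
  --url https://www.nebulatech.in/answers/ai-seo/key-components-of-ai-seo \
  --entity 'AI SEO' \
  --sector 'b2b' \
  --qdrant-url http://localhost:6333 \
  --qdrant-key YOUR_API_KEY \
  --collection nebula_rag_helper \
  --out nebula_ai_seo.jsonl
```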
README.md CHANGED
@@ -1,3 +1,21 @@
  ---
+ tags:
+ - rag
+ - ai-seo
+ - retrieval
+ - llm
+ - seo
  license: apache-2.0
  ---
+
+ # NebulaTech RAG-Helper
+
+ Reference pipeline that turned **74 long-tail Answer Hub pages** into Bing / Perplexity citations.
+ Focus: AI-SEO discoverability using schema markup, fact-dense writing, and 300-token retrieval chunks.
+
+ ## Quick start
+
+ ```bash
+ pip install -r requirements.txt
+ python rag_seo.py --url https://example.com/article
+ ```
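
The script handles ingestion only; retrieval is left to the reader. A minimal query sketch, assuming a local Qdrant instance already populated by `rag_seo.py` into its default `nebula_rag_helper` collection with the same MiniLM model, could look like this:

```python
# Retrieval sketch (not part of the commit): assumes rag_seo.py has upserted
# normalized MiniLM embeddings into a local Qdrant collection.
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
client = QdrantClient(url="http://localhost:6333")

# Encode the query exactly as the chunks were encoded (normalized vectors,
# matching the COSINE distance configured at collection creation).
query_vector = model.encode("key components of AI SEO", normalize_embeddings=True)

hits = client.search(
    collection_name="nebula_rag_helper",
    query_vector=query_vector.tolist(),
    limit=3,
)
for hit in hits:
    # Payload carries the chunk text plus source/entity/sector/position metadata.
    print(f"{hit.score:.3f}", hit.payload["source"], hit.payload["text"][:100])
```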
rag_seo.py ADDED
@@ -0,0 +1,148 @@
+ #!/usr/bin/env python3
+ """
+ RAG-Helper: minimal, reproducible toy script for AI-SEO retrieval demos.
+ - Fetches a URL
+ - Extracts text
+ - Chunks ~300 "tokens" (word approximation)
+ - Creates embeddings (sentence-transformers)
+ - (Optional) Upserts into Qdrant
+ - Generates a short "copy-cite" answer block with footnotes
+ """
+
+ import argparse, re, uuid, json, os
+ from typing import List, Dict, Optional
+ import requests
+ from bs4 import BeautifulSoup
+ from tqdm import tqdm
+ import numpy as np
+
+ try:
+     from sentence_transformers import SentenceTransformer
+ except Exception:
+     raise SystemExit("Please install requirements: pip install -r requirements.txt")
+
+
+ def fetch_url(url: str) -> str:
+     r = requests.get(url, timeout=30)
+     r.raise_for_status()
+     return r.text
+
+
+ def html_to_text(html: str) -> str:
+     soup = BeautifulSoup(html, "html.parser")
+     for tag in soup(["script", "style", "noscript"]):
+         tag.decompose()
+     text = soup.get_text(separator=" ")
+     return re.sub(r"\s+", " ", text).strip()
+
+
+ def chunk_text(text: str, target_tokens: int = 300) -> List[str]:
+     words = text.split()
+     chunks = []
+     for i in range(0, len(words), target_tokens):
+         chunk = " ".join(words[i:i+target_tokens])
+         if chunk:
+             chunks.append(chunk)
+     return chunks
+
+
+ def embed_chunks(chunks: List[str], model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
+     model = SentenceTransformer(model_name)
+     return model.encode(chunks, batch_size=32, show_progress_bar=True,
+                         convert_to_numpy=True, normalize_embeddings=True)
+
+
+ def build_payload(chunks: List[str], embs: np.ndarray, source_url: str, entity: str = "", sector: str = "") -> List[Dict]:
+     vectors = []
+     for idx, (c, v) in enumerate(zip(chunks, embs)):
+         vectors.append({
+             "id": str(uuid.uuid4()),
+             "text": c,
+             "vector": v.tolist(),
+             "metadata": {
+                 "source": source_url,
+                 "entity": entity,
+                 "sector": sector,
+                 "position": idx
+             }
+         })
+     return vectors
+
+
+ def optional_qdrant_upsert(vectors: List[Dict], collection: str, qdrant_url: Optional[str] = None, api_key: Optional[str] = None):
+     try:
+         from qdrant_client import QdrantClient
+         from qdrant_client.models import PointStruct, Distance, VectorParams
+     except Exception:
+         print("qdrant-client not installed; skipping vector DB upsert.")
+         return
+
+     client = QdrantClient(url=qdrant_url or "http://localhost:6333", api_key=api_key)
+     dim = len(vectors[0]["vector"])
+
+     try:
+         client.get_collection(collection)
+     except Exception:
+         client.recreate_collection(
+             collection_name=collection,
+             vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
+         )
+
+     points = [PointStruct(id=v["id"], vector=v["vector"], payload=v["metadata"] | {"text": v["text"]}) for v in vectors]
+     client.upsert(collection_name=collection, points=points)
+     print(f"Upserted {len(points)} vectors into Qdrant collection '{collection}'.")
+
+
+ def make_copy_cite(vectors: List[Dict], k: int = 3) -> str:
+     top = vectors[:k]
+     bullets = []
+     for i, v in enumerate(top, start=1):
+         snippet = v["text"][:280] + ("..." if len(v["text"]) > 280 else "")
+         bullets.append(f"- {snippet} [{i}]")
+     footnotes = "\n".join([f"[{i}] {v['metadata']['source']}" for i, v in enumerate(top, start=1)])
+     return "**Answer (draft):**\n" + "\n".join(bullets) + "\n\n" + footnotes
+
+
+ def main():
+     ap = argparse.ArgumentParser(description="NebulaTech RAG-Helper (toy)")
+     ap.add_argument("--url", required=True, help="Public URL to ingest")
+     ap.add_argument("--entity", default="", help="Primary entity (brand/product/topic)")
+     ap.add_argument("--sector", default="", help="Sector tag (e.g., architecture, pharma)")
+     ap.add_argument("--qdrant-url", default=None, help="Qdrant endpoint (optional)")
+     ap.add_argument("--qdrant-key", default=None, help="Qdrant API key (optional)")
+     ap.add_argument("--collection", default="nebula_rag_helper", help="Qdrant collection name")
+     ap.add_argument("--out", default="output.jsonl", help="Local JSONL output")
+     args = ap.parse_args()
+
+     print(f"[1/5] Fetching: {args.url}")
+     html = fetch_url(args.url)
+     text = html_to_text(html)
+
+     print("[2/5] Chunking ~300 tokens...")
+     chunks = chunk_text(text)
+     if not chunks:
+         raise SystemExit("No text extracted; aborting.")
+
+     print(f"[3/5] Embedding {len(chunks)} chunks...")
+     embs = embed_chunks(chunks)
+
+     print("[4/5] Building vectors + metadata...")
+     vectors = build_payload(chunks, embs, source_url=args.url, entity=args.entity, sector=args.sector)
+
+     if args.qdrant_url:
+         optional_qdrant_upsert(vectors, collection=args.collection, qdrant_url=args.qdrant_url, api_key=args.qdrant_key)
+
+     with open(args.out, "w", encoding="utf-8") as f:
+         for v in vectors:
+             f.write(json.dumps(v, ensure_ascii=False) + "\n")
+     print(f"[5/5] Wrote {len(vectors)} vectors to {args.out}")
+
+     copy_cite = make_copy_cite(vectors, k=3)
+     cc_path = os.path.splitext(args.out)[0] + "_copycite.md"
+     with open(cc_path, "w", encoding="utf-8") as f:
+         f.write(copy_cite)
+     print(f"Generated copy-cite block at {cc_path}")
+
+
+ if __name__ == "__main__":
+     main()
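
For orientation, a quick way to inspect what `build_payload` writes to the JSONL output. The field summary in the comments mirrors the dict constructed above; the file name assumes the script's default `--out` value.

```python
# Inspection sketch (not part of the commit): read back the first JSONL record
# written by rag_seo.py and summarize its shape.
import json

with open("output.jsonl", encoding="utf-8") as f:  # default --out path
    record = json.loads(f.readline())

# Fields written by build_payload():
#   id       -> random UUID4 string
#   text     -> one ~300-word chunk of page text
#   vector   -> normalized MiniLM embedding (384 floats for all-MiniLM-L6-v2)
#   metadata -> {"source", "entity", "sector", "position"}
print(record["id"], len(record["vector"]), record["metadata"]["position"])
```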
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ # Minimal, widely available packages
+ sentence-transformers==2.6.1
+ qdrant-client==1.9.1
+ requests>=2.31.0
+ beautifulsoup4>=4.12.3
+ tqdm>=4.66.4
+ numpy>=1.26.0
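
Finally, the `*_copycite.md` draft emitted by `make_copy_cite()` follows the pattern below. The snippets are placeholders; since a single run ingests one URL, every footnote resolves to that same source.

```markdown
**Answer (draft):**
- First ~280 characters of chunk 1... [1]
- First ~280 characters of chunk 2... [2]
- First ~280 characters of chunk 3... [3]

[1] https://example.com/article
[2] https://example.com/article
[3] https://example.com/article
```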