nebulatech-in committed
Commit bff8572 · verified
Parent: ef93e02

Upload 5 files

Files changed (5)
  1. .gitignore +7 -0
  2. EXAMPLE.json +3 -0
  3. README.md +18 -0
  4. rag_seo.py +148 -0
  5. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1,7 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.DS_Store
+ .env
+ output.jsonl
+ *_copycite.md
EXAMPLE.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "example_usage": "python rag_seo.py --url https://www.nebulatech.in/answers/ai-seo/key-components-of-ai-seo --entity 'AI SEO' --sector 'b2b' --out nebula_ai_seo.jsonl"
+ }
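
For reference, a fuller invocation that also exercises the optional Qdrant flags defined in `rag_seo.py` might look like the sketch below. The flags are real (they appear in the script's argparse setup), but the endpoint and API-key values are illustrative placeholders, not part of the committed example.

```bash
# Hypothetical invocation; the endpoint, key, and collection values are placeholders.
python rag_seo.py \
  --url https://www.nebulatech.in/answers/ai-seo/key-components-of-ai-seo \
  --entity 'AI SEO' \
  --sector 'b2b' \
  --qdrant-url http://localhost:6333 \
  --qdrant-key YOUR_API_KEY \
  --collection nebula_rag_helper \
  --out nebula_ai_seo.jsonl
```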
README.md CHANGED
@@ -1,3 +1,21 @@
  ---
+ tags:
+ - rag
+ - ai-seo
+ - retrieval
+ - llm
+ - seo
  license: apache-2.0
  ---
+
+ # NebulaTech RAG-Helper
+
+ Reference pipeline that turned **74 long-tail Answer Hub pages** into Bing / Perplexity citations.
+ Focus: AI-SEO discoverability using schema markup, fact-dense writing, and 300-token retrieval chunks.
+
+ ## Quick start
+
+ ```bash
+ pip install -r requirements.txt
+ python rag_seo.py --url https://example.com/article
+ ```
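
The script handles ingestion only; retrieval is left to the reader. A minimal query sketch, assuming a local Qdrant instance already populated by `rag_seo.py` into its default `nebula_rag_helper` collection with the same MiniLM model, could look like this:

```python
# Retrieval sketch (not part of the commit): assumes rag_seo.py has upserted
# normalized MiniLM embeddings into a local Qdrant collection.
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
client = QdrantClient(url="http://localhost:6333")

# Encode the query exactly as the chunks were encoded (normalized vectors,
# matching the COSINE distance configured at collection creation).
query_vector = model.encode("key components of AI SEO", normalize_embeddings=True)

hits = client.search(
    collection_name="nebula_rag_helper",
    query_vector=query_vector.tolist(),
    limit=3,
)
for hit in hits:
    # Payload carries the chunk text plus source/entity/sector/position metadata.
    print(f"{hit.score:.3f}", hit.payload["source"], hit.payload["text"][:100])
```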
rag_seo.py ADDED
@@ -0,0 +1,148 @@
+ #!/usr/bin/env python3
+ """
+ RAG-Helper: minimal, reproducible toy script for AI-SEO retrieval demos.
+ - Fetches a URL
+ - Extracts text
+ - Chunks ~300 "tokens" (word approximation)
+ - Creates embeddings (sentence-transformers)
+ - (Optional) Upserts into Qdrant
+ - Generates a short "copy-cite" answer block with footnotes
+ """
+
+ import argparse, re, uuid, json, os
+ from typing import List, Dict, Optional
+ import requests
+ from bs4 import BeautifulSoup
+ from tqdm import tqdm
+ import numpy as np
+
+ try:
+     from sentence_transformers import SentenceTransformer
+ except Exception:
+     raise SystemExit("Please install requirements: pip install -r requirements.txt")
+
+
+ def fetch_url(url: str) -> str:
+     r = requests.get(url, timeout=30)
+     r.raise_for_status()
+     return r.text
+
+
+ def html_to_text(html: str) -> str:
+     soup = BeautifulSoup(html, "html.parser")
+     for tag in soup(["script", "style", "noscript"]):
+         tag.decompose()
+     text = soup.get_text(separator=" ")
+     return re.sub(r"\s+", " ", text).strip()
+
+
+ def chunk_text(text: str, target_tokens: int = 300) -> List[str]:
+     words = text.split()
+     chunks = []
+     for i in range(0, len(words), target_tokens):
+         chunk = " ".join(words[i:i+target_tokens])
+         if chunk:
+             chunks.append(chunk)
+     return chunks
+
+
+ def embed_chunks(chunks: List[str], model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
+     model = SentenceTransformer(model_name)
+     return model.encode(chunks, batch_size=32, show_progress_bar=True,
+                         convert_to_numpy=True, normalize_embeddings=True)
+
+
+ def build_payload(chunks: List[str], embs: np.ndarray, source_url: str, entity: str = "", sector: str = "") -> List[Dict]:
+     vectors = []
+     for idx, (c, v) in enumerate(zip(chunks, embs)):
+         vectors.append({
+             "id": str(uuid.uuid4()),
+             "text": c,
+             "vector": v.tolist(),
+             "metadata": {
+                 "source": source_url,
+                 "entity": entity,
+                 "sector": sector,
+                 "position": idx
+             }
+         })
+     return vectors
+
+
+ def optional_qdrant_upsert(vectors: List[Dict], collection: str, qdrant_url: Optional[str] = None, api_key: Optional[str] = None):
+     try:
+         from qdrant_client import QdrantClient
+         from qdrant_client.models import PointStruct, Distance, VectorParams
+     except Exception:
+         print("qdrant-client not installed; skipping vector DB upsert.")
+         return
+
+     client = QdrantClient(url=qdrant_url or "http://localhost:6333", api_key=api_key)
+     dim = len(vectors[0]["vector"])
+
+     try:
+         client.get_collection(collection)
+     except Exception:
+         client.recreate_collection(
+             collection_name=collection,
+             vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
+         )
+
+     points = [PointStruct(id=v["id"], vector=v["vector"], payload=v["metadata"] | {"text": v["text"]}) for v in vectors]
+     client.upsert(collection_name=collection, points=points)
+     print(f"Upserted {len(points)} vectors into Qdrant collection '{collection}'.")
+
+
+ def make_copy_cite(vectors: List[Dict], k: int = 3) -> str:
+     top = vectors[:k]
+     bullets = []
+     for i, v in enumerate(top, start=1):
+         snippet = v["text"][:280] + ("..." if len(v["text"]) > 280 else "")
+         bullets.append(f"- {snippet} [{i}]")
+     footnotes = "\n".join([f"[{i}] {v['metadata']['source']}" for i, v in enumerate(top, start=1)])
+     return "**Answer (draft):**\n" + "\n".join(bullets) + "\n\n" + footnotes
+
+
+ def main():
+     ap = argparse.ArgumentParser(description="NebulaTech RAG-Helper (toy)")
+     ap.add_argument("--url", required=True, help="Public URL to ingest")
+     ap.add_argument("--entity", default="", help="Primary entity (brand/product/topic)")
+     ap.add_argument("--sector", default="", help="Sector tag (e.g., architecture, pharma)")
+     ap.add_argument("--qdrant-url", default=None, help="Qdrant endpoint (optional)")
+     ap.add_argument("--qdrant-key", default=None, help="Qdrant API key (optional)")
+     ap.add_argument("--collection", default="nebula_rag_helper", help="Qdrant collection name")
+     ap.add_argument("--out", default="output.jsonl", help="Local JSONL output")
+     args = ap.parse_args()
+
+     print(f"[1/5] Fetching: {args.url}")
+     html = fetch_url(args.url)
+     text = html_to_text(html)
+
+     print("[2/5] Chunking ~300 tokens...")
+     chunks = chunk_text(text)
+     if not chunks:
+         raise SystemExit("No text extracted; aborting.")
+
+     print(f"[3/5] Embedding {len(chunks)} chunks...")
+     embs = embed_chunks(chunks)
+
+     print("[4/5] Building vectors + metadata...")
+     vectors = build_payload(chunks, embs, source_url=args.url, entity=args.entity, sector=args.sector)
+
+     if args.qdrant_url:
+         optional_qdrant_upsert(vectors, collection=args.collection, qdrant_url=args.qdrant_url, api_key=args.qdrant_key)
+
+     with open(args.out, "w", encoding="utf-8") as f:
+         for v in vectors:
+             f.write(json.dumps(v, ensure_ascii=False) + "\n")
+     print(f"[5/5] Wrote {len(vectors)} vectors to {args.out}")
+
+     copy_cite = make_copy_cite(vectors, k=3)
+     cc_path = os.path.splitext(args.out)[0] + "_copycite.md"
+     with open(cc_path, "w", encoding="utf-8") as f:
+         f.write(copy_cite)
+     print(f"Generated copy-cite block at {cc_path}")
+
+
+ if __name__ == "__main__":
+     main()
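
For orientation, a quick way to inspect what `build_payload` writes to the JSONL output. The field summary in the comments mirrors the dict constructed above; the file name assumes the script's default `--out` value.

```python
# Inspection sketch (not part of the commit): read back the first JSONL record
# written by rag_seo.py and summarize its shape.
import json

with open("output.jsonl", encoding="utf-8") as f:  # default --out path
    record = json.loads(f.readline())

# Fields written by build_payload():
#   id       -> random UUID4 string
#   text     -> one ~300-word chunk of page text
#   vector   -> normalized MiniLM embedding (384 floats for all-MiniLM-L6-v2)
#   metadata -> {"source", "entity", "sector", "position"}
print(record["id"], len(record["vector"]), record["metadata"]["position"])
```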
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ # Minimal, widely available packages
+ sentence-transformers==2.6.1
+ qdrant-client==1.9.1
+ requests>=2.31.0
+ beautifulsoup4>=4.12.3
+ tqdm>=4.66.4
+ numpy>=1.26.0
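
Finally, the `*_copycite.md` draft emitted by `make_copy_cite()` follows the pattern below. The snippets are placeholders; since a single run ingests one URL, every footnote resolves to that same source.

```markdown
**Answer (draft):**
- First ~280 characters of chunk 1... [1]
- First ~280 characters of chunk 2... [2]
- First ~280 characters of chunk 3... [3]

[1] https://example.com/article
[2] https://example.com/article
[3] https://example.com/article
```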