diff --git a/docs/examples/rag_gpt_oss.md b/docs/examples/rag_gpt_oss.md
new file mode 100644
index 00000000..0e1229ac
--- /dev/null
+++ b/docs/examples/rag_gpt_oss.md
@@ -0,0 +1,66 @@
+# Minimal RAG + gpt-oss Example (FAISS Retrieval)
+
+This example demonstrates a simple, production-style Retrieval-Augmented Generation (RAG) pipeline using FAISS, sentence-transformers, and gpt-oss (or any OpenAI-compatible endpoint).
+
+**No project configs or core files are changed. All code and dependencies are local to `examples/`.**
+
+## Setup
+
+1. Install requirements (in a virtualenv):
+
+```sh
+pip install -r examples/requirements-rag.txt
+```
+
+2. Set environment variables:
+
+- `OPENAI_API_KEY` (your key)
+- `OPENAI_BASE_URL` (e.g., `http://localhost:8000/v1` for vLLM/gpt-oss)
+- `GPT_OSS_MODEL` (model name, e.g., `gpt-oss-20b`)
+
+## Usage
+
+```sh
+python examples/rag_gpt_oss.py --query "What is vector search?" --top_k 4
+```
+
+Optional flags:
+- `--rebuild-index` (force reindex)
+- `--no-stream` (disable streaming)
+- `--chunk-size` (default 800)
+- `--chunk-overlap` (default 120)
+
+## What it does
+
+- Loads docs from `examples/data/*.{txt,md,pdf}` (PDFs require `pymupdf`)
+- Builds or loads a FAISS index in `examples/data/.faiss/`
+- Retrieves top-k chunks with metadata (source file, char span)
+- Constructs a Harmony-style prompt (the system message sets behavior; the user message carries the question and the retrieved context, with sources cited as [1], [2], ...; see the sketch below)
+- Calls an OpenAI-compatible chat endpoint using the official `openai` Python SDK
+- Streams output (unless `--no-stream`)
+- Prints the answer and a compact citations list (`[n] filename`)
+- Saves a JSONL transcript to `examples/data/runs/{timestamp}.jsonl`
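+
+For reference, the messages sent to the endpoint look roughly like this (a sketch based on `build_harmony_messages` in `examples/utils/harmony_helpers.py`; chunk text abridged):
+
+```python
+messages = [
+    {"role": "system", "content": "You are a helpful assistant. Use ONLY the provided CONTEXT. ..."},
+    {"role": "user", "content": "QUESTION: What is vector search?\nCONTEXT:\n[1] Vector search is a technique ...\n[2] Embeddings are ..."},
+]
+```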
+
+## Example Output
+
+```
+Answer: Vector search is a method ...
+
+Sources:
+[1] intro_vector_search.md
+[2] embeddings_and_faiss.md
+```
+
+## Pointing to a Local vLLM Server
+
+Set `OPENAI_BASE_URL` to your vLLM/gpt-oss endpoint, e.g.:
+
+```
+export OPENAI_BASE_URL=http://localhost:8000/v1
+```
+
+## Notes
+
+- This is a minimal, example-only script. It does not alter project configs or CI.
+- If required packages (faiss, pymupdf) are missing, install hints are printed and the script exits cleanly.
+- All code is self-contained under `examples/`, with no changes to core project files.
diff --git a/examples/data/embeddings_and_faiss.md b/examples/data/embeddings_and_faiss.md
new file mode 100644
index 00000000..94b1edfd
--- /dev/null
+++ b/examples/data/embeddings_and_faiss.md
@@ -0,0 +1,7 @@
+# Embeddings and FAISS
+
+Embeddings are vector representations of text. FAISS is a fast library for similarity search and clustering of dense vectors. To use FAISS:
+
+1. Generate embeddings for your text chunks using a model like sentence-transformers/all-MiniLM-L6-v2.
+2. Build a FAISS index from these vectors.
+3. Retrieve top-k similar chunks for a query using cosine similarity.
\ No newline at end of file
diff --git a/examples/data/intro_vector_search.md b/examples/data/intro_vector_search.md
new file mode 100644
index 00000000..b2b24bb0
--- /dev/null
+++ b/examples/data/intro_vector_search.md
@@ -0,0 +1,3 @@
+# Introduction to Vector Search
+
+Vector search is a technique that enables searching for information based on the semantic meaning of text, rather than exact keyword matches. It works by converting text into high-dimensional vectors (embeddings) and finding the most similar vectors using distance metrics like cosine similarity. This approach powers modern retrieval-augmented generation (RAG) systems and semantic search engines.
\ No newline at end of file
diff --git a/examples/data/retrieval_best_practices.txt b/examples/data/retrieval_best_practices.txt
new file mode 100644
index 00000000..0559942b
--- /dev/null
+++ b/examples/data/retrieval_best_practices.txt
@@ -0,0 +1,6 @@
+Chunking and overlap are crucial for effective retrieval:
+
+- Use chunk sizes that balance context (e.g., 800 characters) and retrieval granularity.
+- Overlap chunks (e.g., 120 characters) to avoid missing relevant information at boundaries.
+- Clean and normalize text before indexing.
+- Always cite sources for transparency.
\ No newline at end of file
diff --git a/examples/rag_gpt_oss.py b/examples/rag_gpt_oss.py
new file mode 100644
index 00000000..75da1ca9
--- /dev/null
+++ b/examples/rag_gpt_oss.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python
+"""
+Minimal RAG + gpt-oss example using FAISS retrieval.
+See docs/examples/rag_gpt_oss.md for details.
+"""
+import os
+import sys
+import time
+import json
+import argparse
+import glob
+import hashlib
+import datetime
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+# --- Dependency checks and fallbacks ---
+try:
+    import faiss
+except ImportError:
+    print("[ERROR] Missing dependency: faiss-cpu. Install with: pip install 'faiss-cpu>=1.8'", file=sys.stderr)
+    sys.exit(2)
+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    print("[ERROR] Missing dependency: sentence-transformers. Install with: pip install 'sentence-transformers>=2.6'", file=sys.stderr)
+    sys.exit(2)
+try:
+    import tiktoken
+
+    def count_tokens(text):
+        enc = tiktoken.get_encoding("cl100k_base")
+        return len(enc.encode(text))
+except ImportError:
+    def count_tokens(text):
+        return len(text.encode("utf-8")) // 4  # crude fallback: ~4 bytes per token
+try:
+    import fitz  # pymupdf
+
+    def extract_pdf_text(path):
+        doc = fitz.open(path)
+        return "\n".join(page.get_text() for page in doc)
+except ImportError:
+    def extract_pdf_text(path):
+        print("[ERROR] pymupdf not installed. Install with: pip install 'pymupdf>=1.24'", file=sys.stderr)
+        sys.exit(2)
+try:
+    from openai import OpenAI
+except ImportError:
+    print("[ERROR] Missing dependency: openai. Install with: pip install 'openai>=1.40'", file=sys.stderr)
+    sys.exit(2)
+
+# --- Harmony helpers ---
+# Put this script's directory on sys.path so the utils/ import resolves when the
+# script is run directly (python examples/rag_gpt_oss.py).
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from utils.harmony_helpers import build_harmony_messages, validate_harmony_response
+
+# --- Chunker ---
+def recursive_chunk(text, chunk_size=800, chunk_overlap=120):
+    """Split text into overlapping windows; sizes are in characters, not tokens."""
+    chunks = []
+    start = 0
+    text_len = len(text)
+    while start < text_len:
+        end = min(start + chunk_size, text_len)
+        chunk = text[start:end]
+        chunks.append((start, end, chunk))
+        if end == text_len:
+            break
+        start += chunk_size - chunk_overlap
+    return chunks
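+
+# For example (hypothetical input), recursive_chunk("abcdefghij", chunk_size=6, chunk_overlap=2)
+# returns [(0, 6, 'abcdef'), (4, 10, 'efghij')]; consecutive spans share 2 characters.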
+
+# --- Doc loader ---
+def load_docs(data_dir: str) -> List[Dict]:
+    docs = []
+    for path in glob.glob(os.path.join(data_dir, '*.*')):
+        ext = os.path.splitext(path)[1].lower()
+        if ext in {'.md', '.txt'}:
+            with open(path, encoding='utf-8') as f:
+                text = f.read()
+        elif ext == '.pdf':
+            text = extract_pdf_text(path)
+        else:
+            continue
+        docs.append({'path': path, 'text': text})
+    return docs
+
+# --- Indexing ---
+def build_or_load_faiss(docs: List[Dict], faiss_dir: str, chunk_size: int, chunk_overlap: int, model_name: str) -> Tuple[faiss.IndexFlatIP, List[Dict]]:
+    os.makedirs(faiss_dir, exist_ok=True)
+    meta_path = os.path.join(faiss_dir, 'meta.json')
+    index_path = os.path.join(faiss_dir, 'index.bin')
+    chunks_path = os.path.join(faiss_dir, 'chunks.jsonl')
+    # Check if index exists and is up-to-date
+    doc_hash = hashlib.sha1()
+    for doc in docs:
+        stat = os.stat(doc['path'])
+        doc_hash.update(f"{doc['path']}:{stat.st_mtime}".encode())
+    # Key the cache on chunking params and model too, so changing them forces a rebuild.
+    doc_hash.update(f"{chunk_size}:{chunk_overlap}:{model_name}".encode())
+    hash_hex = doc_hash.hexdigest()
+    if os.path.exists(meta_path):
+        with open(meta_path) as f:
+            meta = json.load(f)
+        if meta.get('hash') == hash_hex and os.path.exists(index_path) and os.path.exists(chunks_path):
+            index = faiss.read_index(index_path)
+            with open(chunks_path) as f:
+                chunks = [json.loads(line) for line in f]
+            return index, chunks
+    # Rebuild index
+    model = SentenceTransformer(model_name)
+    all_chunks = []
+    vectors = []
+    for doc in docs:
+        for i, (start, end, chunk) in enumerate(recursive_chunk(doc['text'], chunk_size, chunk_overlap)):
+            chunk_id = f"{os.path.basename(doc['path'])}#{i}"
+            all_chunks.append({
+                'id': chunk_id,
+                'text': chunk,
+                'source': os.path.basename(doc['path']),
+                'span': [start, end],
+                'path': doc['path']
+            })
+            vectors.append(chunk)
+    if not all_chunks:
+        print("[ERROR] No chunks found for indexing.", file=sys.stderr)
+        sys.exit(2)
+    embeds = model.encode(vectors, normalize_embeddings=True, show_progress_bar=True)
+    dim = embeds.shape[1]
+    index = faiss.IndexFlatIP(dim)
+    index.add(embeds)
+    faiss.write_index(index, index_path)
+    with open(chunks_path, 'w', encoding='utf-8') as f:
+        for chunk in all_chunks:
+            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')
+    with open(meta_path, 'w') as f:
+        json.dump({'hash': hash_hex, 'dim': dim, 'model': model_name}, f)
+    return index, all_chunks
+
+# --- Retrieval ---
+def retrieve(query: str, index, chunks: List[Dict], model_name: str, top_k: int) -> List[Dict]:
+    model = SentenceTransformer(model_name)
+    qvec = model.encode([query], normalize_embeddings=True)
+    D, I = index.search(qvec, top_k)
+    results = []
+    for rank, idx in enumerate(I[0]):
+        if idx < 0 or idx >= len(chunks):
+            continue
+        chunk = chunks[idx].copy()
+        chunk['score'] = float(D[0][rank])
+        chunk['rank'] = rank + 1
+        results.append(chunk)
+    return results
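+
+# Note: because embeddings are encoded with normalize_embeddings=True, the inner
+# product computed by IndexFlatIP equals cosine similarity (scores in [-1, 1]).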
+
+# --- Main CLI ---
+def main():
+    parser = argparse.ArgumentParser(description="Minimal RAG + gpt-oss example (FAISS retrieval)")
+    parser.add_argument('--query', required=True, help='User query')
+    parser.add_argument('--top_k', type=int, default=4, help='Top-k chunks to retrieve')
+    parser.add_argument('--rebuild-index', action='store_true', help='Force rebuild FAISS index')
+    parser.add_argument('--no-stream', action='store_true', help='Disable streaming output')
+    parser.add_argument('--chunk-size', type=int, default=800, help='Chunk size (chars)')
+    parser.add_argument('--chunk-overlap', type=int, default=120, help='Chunk overlap (chars)')
+    args = parser.parse_args()
+
+    # Env vars
+    api_key = os.getenv('OPENAI_API_KEY')
+    base_url = os.getenv('OPENAI_BASE_URL')
+    model = os.getenv('GPT_OSS_MODEL')
+    if not (api_key and base_url and model):
+        print("[ERROR] Set OPENAI_API_KEY, OPENAI_BASE_URL, and GPT_OSS_MODEL.", file=sys.stderr)
+        sys.exit(2)
+
+    data_dir = os.path.join(os.path.dirname(__file__), 'data')
+    faiss_dir = os.path.join(data_dir, '.faiss')
+    runs_dir = os.path.join(data_dir, 'runs')
+    os.makedirs(runs_dir, exist_ok=True)
+
+    docs = load_docs(data_dir)
+    if not docs:
+        print("[ERROR] No documents found in examples/data/", file=sys.stderr)
+        sys.exit(2)
+
+    # Index
+    if args.rebuild_index:
+        for f in Path(faiss_dir).glob('*'):
+            f.unlink()
+    index, all_chunks = build_or_load_faiss(docs, faiss_dir, args.chunk_size, args.chunk_overlap, 'sentence-transformers/all-MiniLM-L6-v2')
+    if index.ntotal == 0 or not all_chunks:
+        print("[ERROR] FAISS index is empty.", file=sys.stderr)
+        sys.exit(2)
+
+    # Retrieval
+    retrieved = retrieve(args.query, index, all_chunks, 'sentence-transformers/all-MiniLM-L6-v2', args.top_k)
+    if not retrieved:
+        print("[ERROR] No relevant chunks retrieved.", file=sys.stderr)
+        sys.exit(2)
+
+    # Prompt
+    system_prompt = "You are a helpful assistant. Use ONLY the provided CONTEXT. Cite sources as [1], [2], ... Map them to filenames at the end under 'Sources'."
+    messages = build_harmony_messages(system_prompt, args.query, retrieved)
+
+    # OpenAI-compatible call
+    client = OpenAI(base_url=base_url, api_key=api_key)
+    start_time = time.time()
+    response_text = ""
+    try:
+        stream = not args.no_stream
+        completion = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            stream=stream,
+            temperature=0.2,
+            max_tokens=512
+        )
+        if stream:
+            print("\nAnswer:", end=" ", flush=True)
+            for chunk in completion:
+                delta = getattr(chunk.choices[0].delta, 'content', None)
+                if delta:
+                    print(delta, end="", flush=True)
+                    response_text += delta
+            print()
+        else:
+            response_text = completion.choices[0].message.content or ""
+            print("\nAnswer:", response_text)
+    except Exception as e:
+        print(f"[ERROR] Model call failed: {e}", file=sys.stderr)
+        sys.exit(2)
+    latency_ms = int((time.time() - start_time) * 1000)
+
+    # Validate response
+    if not validate_harmony_response(response_text):
+        print("[ERROR] Model returned empty or invalid response.", file=sys.stderr)
+        sys.exit(2)
+
+    # Citations
+    print("\nSources:")
+    for i, chunk in enumerate(retrieved, 1):
+        print(f"[{i}] {chunk['source']}")
+
+    # Save transcript
+    ts = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    run_path = os.path.join(runs_dir, f'{ts}.jsonl')
+    with open(run_path, 'w', encoding='utf-8') as f:
+        log = {
+            'query': args.query,
+            'retrieved_ids': [c['id'] for c in retrieved],
+            'prompt': messages,
+            'model': model,
+            'latency_ms': latency_ms,
+            'answer': response_text
+        }
+        f.write(json.dumps(log, ensure_ascii=False) + '\n')
+    # Simple inline test
+    assert os.path.exists(run_path) and os.path.getsize(run_path) > 0, "Transcript not saved!"
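+
+# Each transcript line is a JSON object with the keys used in `log` above:
+# query, retrieved_ids, prompt, model, latency_ms, answer.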
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/requirements-rag.txt b/examples/requirements-rag.txt
new file mode 100644
index 00000000..c26f08af
--- /dev/null
+++ b/examples/requirements-rag.txt
@@ -0,0 +1,5 @@
+faiss-cpu>=1.8
+sentence-transformers>=2.6
+pymupdf>=1.24
+tiktoken>=0.7
+openai>=1.40
diff --git a/examples/utils/harmony_helpers.py b/examples/utils/harmony_helpers.py
new file mode 100644
index 00000000..9c27447c
--- /dev/null
+++ b/examples/utils/harmony_helpers.py
@@ -0,0 +1,28 @@
+def build_harmony_messages(system_prompt: str, user_query: str, retrieved_chunks: list[dict]) -> list[dict]:
+    """
+    Build Harmony-style messages for an OpenAI-compatible chat completion.
+    Each chunk is cited as [n] in CONTEXT and mapped to its source.
+    """
+    context_lines = []
+    for i, chunk in enumerate(retrieved_chunks, 1):
+        context_lines.append(f"[{i}] {chunk['text']}")
+    context = "\n".join(context_lines)
+    user_content = f"QUESTION: {user_query}\nCONTEXT:\n{context}"
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_content},
+    ]
+    return messages
+
+def validate_harmony_response(text: str) -> bool:
+    """
+    Minimal checks: non-empty and not a raw tool-call JSON payload.
+    """
+    if not text or not text.strip():
+        return False
+    # Disallow tool-call JSON (e.g., starts with '{' and contains "tool_call")
+    if text.strip().startswith('{') and 'tool_call' in text:
+        return False
+    return True
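+
+# Quick sanity checks:
+#   validate_harmony_response("")                         -> False  (empty)
+#   validate_harmony_response('{"tool_call": {}}')        -> False  (tool-call JSON)
+#   validate_harmony_response("Vector search is ... [1]") -> True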