diff --git a/.gitignore b/.gitignore index 5eaa7cdd9526e1f38be67985eecc76da96db8712..cd41b5712ce26752b1c121841bbcad74996fd843 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,8 @@ # Temporary files tmp/ temp/ + +# Local generated artifacts +doc-2.0-sources/ +data/ +__pycache__/ diff --git a/README.md b/README.md index 5416b4e8ba292e75ac0c868edde97b19e7dd3667..ffa2aaabd86859bc43628523548e5f0b0eacad2a 100644 --- a/README.md +++ b/README.md @@ -110,3 +110,78 @@ docker compose up -d - [ClickHouse MCP](https://github.com/ClickHouse/mcp-clickhouse) — MCP server for ClickHouse - [LibreChat](https://github.com/danny-avila/LibreChat) — Chat UI - [Langfuse](https://langfuse.com) — LLM observability + +## Alternative: Local RAG System (ClickHouse + Ollama) + +This repository also includes an alternative Python-based RAG workflow focused on local inference with Ollama and vector search in ClickHouse (see `rag_system_alternative.py` and `ReadMe.txt`). + +### Features + +- Vector search using embeddings in ClickHouse +- Optional reranking for improved answer quality +- Query caching for faster repeated requests +- Multi-document PDF processing +- Benchmarking and export helpers (CSV/JSON/Parquet) + +### Requirements + +#### System + +| Component | Minimum | Recommended | +|---|---|---| +| CPU | 4 cores | 8+ cores | +| RAM | 16 GB | 32+ GB | +| Disk | 10 GB | 20+ GB | +| GPU | Optional | NVIDIA 8 GB VRAM | + +#### Software + +- Python 3.10+ +- ClickHouse (Cloud or local) +- Ollama installed locally + +### Quick Start (Local RAG) + +1. Install dependencies: + +```bash +pip install clickhouse-connect ollama pypdf pandas numpy +pip install ipywidgets tqdm scikit-learn pyarrow +``` + +2. Install and start Ollama: + +```bash +ollama pull nomic-embed-text +ollama pull llama3.2:3b +ollama serve +``` + +3. Configure ClickHouse credentials in your script or environment. + +4. 
Run: + +```bash +python rag_system_alternative.py +``` + +### Suggested Environment Variables (Local RAG) + +```bash +# ClickHouse +CLICKHOUSE_HOST=your-host.clickhouse.cloud +CLICKHOUSE_USER=default +CLICKHOUSE_PASSWORD=your-password + +# Ollama +EMBED_MODEL=nomic-embed-text +LLM_MODEL=llama3.2:3b + +# RAG tuning +CHUNK_SIZE=1000 +CHUNK_OVERLAP=150 +TOP_K=8 +NUM_CTX=4096 +NUM_PREDICT=400 +TEMPERATURE=0.1 +``` diff --git a/bash.exe.stackdump b/bash.exe.stackdump new file mode 100644 index 0000000000000000000000000000000000000000..5fe7e37ebdfa6556bd5c196661984446a2e3c969 --- /dev/null +++ b/bash.exe.stackdump @@ -0,0 +1,32 @@ +Stack trace: +Frame Function Args +000FFFEFF60 00210062F57 (00000000002, 00000000002, 00000000000, 000FFFFDE50) +00000000000 00210065045 (000FFFF0910, 00000000000, 00000000744, 00000000000) +000FFFF0670 0021013AB68 (00000000000, 00000000000, 00000000000, 00000000000) +000000000C1 0021013619B (00000000000, 00200000000, 00000000000, 00000000000) +00000000000 002101365A5 (00210199B0B, 000FFFFFFFF, 0000000000B, 00000080000) +00000000000 00210199B0B (00210199B0B, 000FFFFFFFF, 0000000000B, 00000080000) +00000000000 0010044E724 (00000000000, 00000000000, 00000000000, 00000000000) +00000000000 0010044E8A9 (00000000000, 00000000000, 00000000000, 000FFFFCE00) +00000000000 0010044EA42 (00000000210, 00000738888, 000006F6A90, 000FFFF0BC0) +00000000000 00210065045 (000FFFF1320, 00000000000, 00000000744, 00000000000) +000FFFF11B0 0021013AB68 (00000000000, 00000000000, 00000000030, 00000000000) +000FFFF1320 00210063243 (000FFFF1490, 002102D35D0, 002100477C4, 000FFFF19E0) +000FFFF1990 7FFC12C861CF (00210040000, 002100477C4, 002102D35D0, 000FFFF1490) +000FFFF1990 7FFC12B323A7 (000FFFF22BC, 00210373AA0, 00210132B05, 00000000000) +000FFFFC730 7FFC12C85B0E (00000000000, 00000000000, 00000000000, 00000000000) +000FFFFC730 002100C98E9 (000FFFFC550, 00600000000, 0000000045C, 00000000000) +000FFFFC730 002100CAFF0 (00000000000, 00210135DE8, 000FFFFC660, 00210273880) 
@dataclass
class Config:
    """Settings for the local RAG pipeline, read from environment variables.

    Every ``os.getenv`` default below is evaluated once, when this class body
    runs (at import time); later changes to the environment do not affect it.
    Constructing an instance has side effects: ``__post_init__`` walks
    ``docs_folder``, creates the few-shot and results folders, and prints a
    short summary.
    """

    # ClickHouse connection.  No real host or password is baked in: secrets
    # must come from the environment (or a .env file), never from source
    # control.  SECURITY: the password that used to be the default here was
    # committed to the repository and should be rotated.
    ch_host: str = os.getenv('CLICKHOUSE_HOST', 'localhost')
    ch_user: str = os.getenv('CLICKHOUSE_USER', 'default')
    ch_password: str = os.getenv('CLICKHOUSE_PASSWORD', '')
    ch_secure: bool = os.getenv('CLICKHOUSE_SECURE', 'true').lower() == 'true'

    # Ollama model names
    embed_model: str = os.getenv('EMBED_MODEL', 'nomic-embed-text')
    llm_model: str = os.getenv('LLM_MODEL', 'llama3.2:3b')

    # Retrieval
    chunk_size: int = int(os.getenv('CHUNK_SIZE', '1000'))
    chunk_overlap: int = int(os.getenv('CHUNK_OVERLAP', '150'))
    top_k: int = int(os.getenv('TOP_K', '8'))
    rerank_top_k: int = int(os.getenv('RERANK_TOP_K', '3'))
    # NOTE: despite the name, the search query compares this value against a
    # cosine *distance* (rows are kept when distance < threshold).
    similarity_threshold: float = float(os.getenv('SIMILARITY_THRESHOLD', '0.35'))
    batch_size: int = int(os.getenv('BATCH_SIZE', '32'))

    # Generation options
    num_ctx: int = int(os.getenv('NUM_CTX', '4096'))
    num_predict: int = int(os.getenv('NUM_PREDICT', '400'))
    temperature: float = float(os.getenv('TEMPERATURE', '0.1'))
    top_p: float = float(os.getenv('TOP_P', '0.9'))
    repeat_penalty: float = float(os.getenv('REPEAT_PENALTY', '1.1'))

    # Limits
    max_text_length: int = int(os.getenv('MAX_TEXT_LENGTH', '3072'))
    min_chunk_size: int = int(os.getenv('MIN_CHUNK_SIZE', '100'))
    max_chunks_per_doc: int = int(os.getenv('MAX_CHUNKS_PER_DOC', '2000'))

    # Query cache
    cache_enabled: bool = os.getenv('CACHE_ENABLED', 'true').lower() == 'true'
    cache_ttl: int = int(os.getenv('CACHE_TTL', '3600'))

    # Paths
    docs_folder: str = './doc-2.0-sources'
    few_shot_folder: str = './data/few_shot_examples'
    results_folder: str = './data/results'

    # (full_path, source_name) pairs discovered under docs_folder
    doc_files: List[Tuple[str, str]] = field(default_factory=list)

    def __post_init__(self):
        """Index .txt/.md files under ``docs_folder`` and create output folders."""
        if os.path.exists(self.docs_folder):
            for root, _dirs, files in os.walk(self.docs_folder):
                rel_path = os.path.relpath(root, self.docs_folder)
                for file in files:
                    if not file.endswith(('.txt', '.md')):
                        continue
                    full_path = os.path.join(root, file)
                    if rel_path == '.':
                        # Strip only the trailing extension; chained
                        # str.replace() also removed ".txt"/".md" from the
                        # middle of a file name.
                        source_name = os.path.splitext(file)[0]
                    else:
                        source_name = f"{rel_path}/{file}"
                    self.doc_files.append((full_path, source_name))

        os.makedirs(self.few_shot_folder, exist_ok=True)
        os.makedirs(self.results_folder, exist_ok=True)

        print(f"[DOCS] Folder: {self.docs_folder}")
        print(f"[DOCS] Files found: {len(self.doc_files)}")
class DatabaseManager:
    """Lazy ClickHouse client wrapper plus a small in-process TTL cache.

    The client is created on first use.  All SQL in this class targets the
    ``default.rag_chunks`` table.
    """

    def __init__(self):
        self._client = None
        self._cache = {}       # key -> cached value
        self._cache_time = {}  # key -> time.time() when the value was stored

    def get_client(self):
        """Return the shared client, connecting with the settings in ``config`` on first call."""
        if self._client is None:
            self._client = clickhouse_connect.get_client(
                host=config.ch_host,
                username=config.ch_user,
                password=config.ch_password,
                secure=config.ch_secure,
                compress=True,
                connect_timeout=30
            )
            print("[OK] Connected to ClickHouse")
        return self._client

    def init_database(self, force_recreate: bool = False):
        """Make sure the ``default.rag_chunks`` table exists.

        Args:
            force_recreate: If True, drop the table (deleting all data) and
                create it again.  If False, create it only when it is missing
                and otherwise reuse the existing data.
        """
        client = self.get_client()

        # Check whether the table already exists.
        try:
            result = client.query("EXISTS TABLE default.rag_chunks")
            table_exists = bool(result.result_rows[0][0]) if result.result_rows else False
        except Exception as e:
            # Best effort: CREATE TABLE IF NOT EXISTS below is safe either way,
            # but the failure should not be invisible.
            print(f"[WARN] Could not check whether the table exists: {e}")
            table_exists = False

        if table_exists and not force_recreate:
            print("[OK] Database already exists, reusing existing data")
            # Report how many chunks are already stored (informational only).
            try:
                count_result = client.query("SELECT count(*) FROM default.rag_chunks")
                chunk_count = count_result.result_rows[0][0] if count_result.result_rows else 0
                print(f"[OK] Existing chunks in database: {chunk_count}")
            except Exception as e:
                print(f"[WARN] Could not count existing chunks: {e}")
            return

        # The table is missing, or force_recreate=True.
        if force_recreate:
            print("[WARN] Force recreating database...")
            client.command("DROP TABLE IF EXISTS default.rag_chunks")

        client.command("""
            CREATE TABLE IF NOT EXISTS default.rag_chunks (
                id UInt64,
                source String,
                page UInt32,
                chunk String,
                embedding Array(Float32),
                chunk_hash String,
                char_count UInt32,
                created_at DateTime DEFAULT now()
            ) ENGINE = MergeTree()
            PARTITION BY source
            ORDER BY id
        """)
        print("[OK] Database initialized")

    def insert_batch(self, chunks: List[Dict]):
        """Insert chunk dicts into ``default.rag_chunks``; a no-op for an empty list.

        Each dict must provide the keys ``id``, ``source``, ``page``,
        ``chunk``, ``embedding``, ``chunk_hash`` and ``char_count``.
        """
        if not chunks:
            return
        client = self.get_client()
        rows = [[c['id'], c['source'], c['page'], c['chunk'],
                 c['embedding'], c['chunk_hash'], c['char_count']] for c in chunks]
        client.insert('default.rag_chunks', rows,
                      column_names=['id', 'source', 'page', 'chunk', 'embedding', 'chunk_hash', 'char_count'])
        print(f" [OK] Inserted {len(chunks)} chunks")

    def get_chunk_count(self) -> int:
        """Return the number of stored chunks, or 0 when the count cannot be read."""
        try:
            client = self.get_client()
            result = client.query("SELECT count(*) FROM default.rag_chunks")
            return result.result_rows[0][0] if result.result_rows else 0
        except Exception as e:
            # Still best effort (callers get 0), but say why.
            print(f"[WARN] Could not count chunks: {e}")
            return 0

    def search(self, embedding: List[float]) -> List[tuple]:
        """Return up to ``config.top_k`` rows ``(chunk, source, page, distance)``.

        Rows are ordered by ascending cosine distance to ``embedding``; only
        rows whose distance is below ``config.similarity_threshold`` are kept.
        """
        client = self.get_client()
        query = """
            SELECT chunk, source, page, cosineDistance(embedding, %(emb)s) AS distance
            FROM default.rag_chunks
            WHERE distance < %(threshold)s
            ORDER BY distance ASC
            LIMIT %(top_k)s
        """
        result = client.query(query, parameters={
            # Cached query embeddings arrive as tuples; bind a list so the
            # value is rendered as an array like the `embedding` column.
            'emb': list(embedding),
            'threshold': config.similarity_threshold,
            'top_k': config.top_k
        })
        return result.result_rows

    def get_cache(self, key: str):
        """Return the cached value for ``key``, or None if absent, expired or caching is disabled."""
        if not config.cache_enabled:
            return None
        if key not in self._cache:
            return None
        if time.time() - self._cache_time.get(key, 0) < config.cache_ttl:
            return self._cache[key]
        # Expired: drop the entry so stale values do not pile up for the
        # lifetime of the process.
        self._cache.pop(key, None)
        self._cache_time.pop(key, None)
        return None

    def set_cache(self, key: str, value: str):
        """Store ``value`` under ``key`` with the current time; a no-op when caching is disabled."""
        if config.cache_enabled:
            self._cache[key] = value
            self._cache_time[key] = time.time()
@staticmethod
def split_chunks(text: str, size=None, overlap=None,
                 min_chunk_size=None, max_chunks=None) -> List[str]:
    """Split ``text`` into overlapping windows of whitespace-separated words.

    ``size`` and ``overlap`` are counted in words, while ``min_chunk_size``
    is compared against the character length of the joined window.

    Args:
        text: Text to split.
        size: Words per window; defaults to ``config.chunk_size``.
        overlap: Words shared by consecutive windows; defaults to
            ``config.chunk_overlap``.
        min_chunk_size: Windows of this many characters or fewer are
            dropped; defaults to ``config.min_chunk_size``.
        max_chunks: Stop after this many chunks; defaults to
            ``config.max_chunks_per_doc``.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``size``.
    """
    size = config.chunk_size if size is None else size
    overlap = config.chunk_overlap if overlap is None else overlap
    min_chunk_size = config.min_chunk_size if min_chunk_size is None else min_chunk_size
    max_chunks = config.max_chunks_per_doc if max_chunks is None else max_chunks

    step = size - overlap
    if step <= 0:
        # A zero step makes range() raise an opaque error and a negative one
        # silently yields no chunks at all; fail with a clear message instead.
        raise ValueError(
            f"chunk overlap ({overlap}) must be smaller than chunk size ({size})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunk = ' '.join(words[start:start + size])
        if len(chunk) > min_chunk_size:
            chunks.append(chunk)
            if len(chunks) >= max_chunks:
                break
        if start + size >= len(words):
            # This window already reached the end of the text; any later
            # window would only repeat a suffix of it.
            break
    return chunks
class EmbeddingGenerator:
    """Wrapper around ``ollama.embed`` adding truncation, batching and a query cache.

    Args:
        model: Embedding model name; defaults to ``config.embed_model``.
        batch_size: Texts sent per ``ollama.embed`` call; defaults to
            ``config.batch_size``.
        max_length: Maximum characters kept per text; defaults to
            ``config.max_text_length``.
    """

    # Width of the zero vectors returned when the embedding call fails.
    # NOTE(review): presumably this matches the default embedding model's
    # output size - confirm before switching EMBED_MODEL.
    fallback_dim = 768
    # Maximum number of distinct texts remembered by generate_cached().
    cache_size = 256

    def __init__(self, model=None, batch_size=None, max_length=None):
        self.model = config.embed_model if model is None else model
        self.batch_size = config.batch_size if batch_size is None else batch_size
        self.max_length = config.max_text_length if max_length is None else max_length
        # Per-instance cache (text -> embedding tuple).  A dict keeps insertion
        # order, so the first key is always the least recently used one.
        self._cache = {}

    def _truncate_text(self, text: str) -> str:
        """Cut ``text`` to ``max_length`` characters, preferring to end on a period.

        The cut is moved back to the last period only when that period lies
        in the second half of the truncated text.
        """
        if len(text) <= self.max_length:
            return text
        truncated = text[:self.max_length]
        last_period = truncated.rfind('.')
        if last_period > self.max_length // 2:
            truncated = truncated[:last_period + 1]
        return truncated.strip()

    def generate_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` with a single ``ollama.embed`` call.

        On any error the problem is printed and one all-zero vector of
        ``fallback_dim`` entries is returned per text.
        """
        if not texts:
            return []
        safe_texts = [self._truncate_text(t) for t in texts]
        try:
            response = ollama.embed(model=self.model, input=safe_texts)
            return response['embeddings']
        except Exception as e:
            print(f" Embedding error: {e}")
            return [[0.0] * self.fallback_dim for _ in safe_texts]

    def generate(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in batches of ``batch_size`` and return the vectors in input order."""
        all_embeddings = []
        for start in range(0, len(texts), self.batch_size):
            all_embeddings.extend(self.generate_batch(texts[start:start + self.batch_size]))
        return all_embeddings

    def generate_cached(self, text: str) -> tuple:
        """Return the embedding of ``text`` as a tuple, reusing earlier results.

        The cache lives on the instance and holds at most ``cache_size``
        entries, evicting the least recently used one.  All-zero vectors (the
        failure fallback of ``generate_batch``) are never stored, so a
        transient outage does not poison later lookups of the same text.
        """
        cache = self._cache
        if text in cache:
            embedding = cache.pop(text)  # re-insert to mark as most recently used
            cache[text] = embedding
            return embedding

        embedding = tuple(self.generate_batch([text])[0])
        if any(embedding):
            if len(cache) >= self.cache_size:
                del cache[next(iter(cache))]
            cache[text] = embedding
        return embedding
@staticmethod
def extract_pdf(pdf_path: str, source_name: str) -> List[Tuple[int, str]]:
    """Extract the text of each page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        source_name: Not used by the extraction itself; kept so the
            signature stays unchanged for existing callers.

    Returns:
        ``(page_number, text)`` pairs with 1-based page numbers.  Pages whose
        stripped text is not longer than ``config.min_chunk_size`` characters
        are left out, and runs of newlines are replaced by a single space.
        An empty list is returned when the document cannot be opened.
    """
    try:
        reader = PdfReader(pdf_path)
        total_pages = len(reader.pages)
        print(f" Total pages: {total_pages}")

        pages = []
        for index in range(total_pages):
            try:
                text = reader.pages[index].extract_text()
            except Exception as e:
                # One unreadable page should not lose the whole document, but
                # silently dropping it makes missing content impossible to
                # diagnose - report it and move on.
                print(f" [WARN] Page {index + 1} skipped: {e}")
                continue
            if text and len(text.strip()) > config.min_chunk_size:
                text = re.sub(r'\n+', ' ', text)
                pages.append((index + 1, text.strip()))
        return pages
    except Exception as e:
        print(f" Error: {e}")
        return []
class Reranker:
    """Re-order vector-search hits by blending embedding similarity with keyword overlap."""

    # Runs of four or more word characters; shorter tokens are ignored.
    # Compiled once instead of being looked up for every result row.
    _WORD_RE = re.compile(r'\b\w{4,}\b')

    @staticmethod
    def rerank(question: str, results: List[tuple], top_k=None,
               similarity_weight: float = 0.6,
               overlap_weight: float = 0.4) -> List[tuple]:
        """Return the best-scoring rows of ``results``, highest score first.

        Each row is ``(chunk, source, page, distance)``.  Its score is
        ``(1 - distance) * similarity_weight + overlap * overlap_weight``,
        where ``overlap`` is the fraction of the question's distinct
        4+-character words that also occur in the chunk (case-insensitive).

        Args:
            question: The user question.
            results: Rows as returned by the vector search.
            top_k: Number of rows to keep; defaults to ``config.rerank_top_k``.
            similarity_weight: Weight of the embedding similarity term.
            overlap_weight: Weight of the keyword-overlap term.

        Returns:
            At most ``top_k`` of the original row tuples.  An empty
            ``results`` is returned unchanged.
        """
        if not results:
            return results
        if top_k is None:
            top_k = config.rerank_top_k

        find_words = Reranker._WORD_RE.findall
        q_words = set(find_words(question.lower()))
        # Guard against division by zero when the question has no long words.
        denominator = max(len(q_words), 1)

        scored = []
        for result in results:
            chunk, _source, _page, distance = result
            overlap = len(q_words & set(find_words(chunk.lower()))) / denominator
            similarity = 1 - distance
            final_score = similarity * similarity_weight + overlap * overlap_weight
            scored.append((final_score, result))

        # list.sort is stable, so rows with equal scores keep their search order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [result for _, result in scored[:top_k]]
def count_records(file_path: Path) -> int:
    """Count the records in a questions/answers file.

    * ``.txt``  - number of non-blank lines
    * ``.csv``  - number of rows reported by ``pandas.read_csv``
    * ``.json`` - length of the top-level list, or 1 for any other JSON value

    The suffix is matched case-insensitively.  Unknown suffixes, missing
    files and unparsable content all yield 0; read/parse problems are
    reported on stdout instead of being silently ignored.

    Args:
        file_path: Path of the file to inspect (a plain string also works).

    Returns:
        The record count, or 0 when it cannot be determined.
    """
    file_path = Path(file_path)
    suffix = file_path.suffix.lower()
    try:
        if suffix == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return sum(1 for line in f if line.strip())
        if suffix == '.csv':
            import pandas as pd
            return len(pd.read_csv(file_path))
        if suffix == '.json':
            import json
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return len(data) if isinstance(data, list) else 1
    except (OSError, ValueError, ImportError) as e:
        # ValueError covers invalid JSON, undecodable bytes and the pandas
        # parser errors; counting stays best effort.
        print(f"[WARN] Could not count records in {file_path}: {e}")
    return 0
@staticmethod
def load_answers(file_path: str) -> List[str]:
    """Load reference answers from a .txt, .csv or .json file.

    * ``.txt``  - one answer per non-blank line, stripped
    * ``.csv``  - the first column whose name contains "answer" or
      "response" (case-insensitive), else the first column; missing values
      are dropped and the rest converted to ``str``
    * ``.json`` - for a top-level list: each dict contributes its
      ``'answer'`` value (or ``str(item)`` when the key is absent) and every
      other item contributes ``str(item)``

    The suffix is matched case-insensitively.

    Args:
        file_path: Path of the answers file.

    Returns:
        The answers in file order, or an empty list for an unsupported
        suffix or a JSON document that is not a list.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]

    if suffix == '.csv':
        df = pd.read_csv(file_path)
        for col in df.columns:
            name = str(col).lower()
            if 'answer' in name or 'response' in name:
                return df[col].dropna().astype(str).tolist()
        return df.iloc[:, 0].dropna().astype(str).tolist()

    if suffix == '.json':
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):
            # Plain strings/numbers have no .get(); calling it on them used
            # to raise AttributeError.
            return [
                item.get('answer', str(item)) if isinstance(item, dict) else str(item)
                for item in data
            ]

    return []
df.to_csv(path, index=False, encoding='utf-8') + print(f" Results saved to: {path}") + return df \ No newline at end of file diff --git a/prompts/rag_api_en.txt b/prompts/rag_api_en.txt new file mode 100644 index 0000000000000000000000000000000000000000..6670be8083a608359a9078be96f8d8ace2b1810e --- /dev/null +++ b/prompts/rag_api_en.txt @@ -0,0 +1,29 @@ +The client wants to receive general describing information about API interface. Your task is to +provide well-structured and detailed information about API to the client, which will give an initial understanding +of how the API is used. Below is a collection of information found in the documentation to answer a customer request, +surrounded by --------------------- + +--------------------- +{context} +--------------------- + +This information is a set of blocks, divided into headers in .md format with the following information. + +Customer query: {query} + +Your task is to respond to the client's request as follows: + +1. Select information blocks that, based on the title and content, best match the API name that the client +sent in the request. Generally, there are two types of API - non-form API and form API. If in the customer query you +see any mentions form or forms, please select for answer only information blocks in the headers of which or +in content of which there are mentions form or forms. If in the customer query you don`t see any mentions form or forms, +please select for answer only information blocks where you don`t see any mentions form or forms. + +2. 
From the selected blocks of information, create an answer that has the following structure: +first a brief introduction with a description of what the API is used for, then - flow for this api, step by step as it is if +it was found, then the URLs that are used for this API with a mention of what environment they belong to - sandbox or +production, then you can give a brief set of basic request parameters with their short description and a brief set of +basic response parameters with their short description, if they are in the selected information. Then, you can provide +one example of the request and response used in this API. + +3. In your answer, you do not need to describe the names of the information blocks that were selected for the answer. diff --git a/prompts/rag_api_parameter_en.txt b/prompts/rag_api_parameter_en.txt new file mode 100644 index 0000000000000000000000000000000000000000..362c00a73bc9cecd3d9cc893fddb1d3e606fbe66 --- /dev/null +++ b/prompts/rag_api_parameter_en.txt @@ -0,0 +1,35 @@ +The client wants to receive information about parameter or parameters specified in query. Your task is to +provide well-structured and detailed information about parameter or mentioned parameters to the client. +Below is a collection of information found in the documentation to answer a customer request, +surrounded by --------------------- + +--------------------- +{context} +--------------------- + +This information is a set of blocks, divided into headers in .md format with the following information. + +Customer query: {query} + +Your task is to respond to the customer query as follows: + +Usually, customer asks about one single parameter or difference between the parameters specified in the request. + +If customer query is about one single parameter, do the following: + +It is necessary to find information only on the parameter specified in the request and give the most +detailed answer and complete information only on this specific parameter. 
Information about any other parameters +should not be included in the response to the customer. The following information should be included in the response: + +1. Parameter name strictly as it is. +2. Parameter description strictly as it is. +3. Information about parameter value characteristics: necessity, type, length, etc. +4. Also, you may mention APIs and requests where this parameter is used. Useful notes about this parameter +can also be added to the response if found. + +If a customer query is about several parameters, do the following: + +1. Find information about each mentioned parameter, as described above for a single-parameter request. +2. Include the information found for each parameter in the answer. +3. If a customer query mentions any difference or differences between parameters, analyze how the +specified parameters differ from each other and include a summary of their differences in your answer. diff --git a/prompts/rag_api_parameters_list_en.txt b/prompts/rag_api_parameters_list_en.txt new file mode 100644 index 0000000000000000000000000000000000000000..84eb013ab0f51fbb17036c38e7696b8e60fbf9b7 --- /dev/null +++ b/prompts/rag_api_parameters_list_en.txt @@ -0,0 +1,21 @@ +The client wants to receive information about the parameter list or set. Your task is to +provide well-structured and beautifully presented information to the client. +Below is a collection of information found in the documentation to answer a customer request, +surrounded by --------------------- + +--------------------- +{context} +--------------------- + +This information is a set of blocks, divided into headers in .md format with the following information. + +Customer query: {query} + +Your task is to respond to the client's request as follows: + +1. If the question is about request parameters - select the part of the information that concerns the request parameters. 
+If the question is about response parameters - select the part of the information that concerns the response parameters. +If it is not specified what parameters the query is about - select the entire part of the information that concerns the +parameters. + +2. Present parameters as a detailed list with their description and characteristics. diff --git a/rag-system.ipynb b/rag-system.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..d69b9cf12a678789a773a4470faaf2bdd1a60dc3 --- /dev/null +++ b/rag-system.ipynb @@ -0,0 +1,1101 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "id": "d8543c46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " WARNING: Failed to remove contents in a temporary directory 'C:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~-mpy.libs'.\n", + " You can safely remove it manually.\n", + " WARNING: Failed to remove contents in a temporary directory 'C:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~-mpy'.\n", + " You can safely remove it manually.\n", + "ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "langchain 0.3.0 requires numpy<2,>=1; python_version < \"3.12\", but you have numpy 2.2.6 which is incompatible.\n", + "langchain-community 0.3.0 requires numpy<2,>=1; python_version < \"3.12\", but you have numpy 2.2.6 which is incompatible.\n", + "typer 0.24.1 requires click>=8.2.1, but you have click 8.1.8 which is incompatible.\n" + ] + } + ], + "source": [ + "pip install pywidgets -q" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9eee0f71", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ ClickHouse подключён!\n" + ] + } + ], + "source": [ + "import ollama\n", + "import clickhouse_connect\n", + "import pandas as pd\n", + "import json, time, re\n", + "from pypdf import PdfReader\n", + "from IPython.display import display, HTML, clear_output\n", + "import ipywidgets as widgets\n", + "\n", + "# ── Настройки ────────────────────────────────────────────\n", + "CH_HOST = 'ug1o26imbr.eu-central-1.aws.clickhouse.cloud'\n", + "CH_USER = 'default'\n", + "CH_PASSWORD = '~MlK_g7KdbqYH' # ← вставьте реальный пароль\n", + "\n", + "EMBED_MODEL = 'nomic-embed-text'\n", + "LLM_MODEL = 'llama3.1'\n", + "\n", + "PDF_FILES = [\n", + " (r'C:\\Users\\User\\Desktop\\Folder_vs_documents\\integration.pdf', 'Integration'),\n", + " (r'C:\\Users\\User\\Desktop\\Folder_vs_documents\\manager.pdf', 'Manager'),\n", + " (r'C:\\Users\\User\\Desktop\\Folder_vs_documents\\merchant.pdf', 'Merchant'),\n", + "]\n", + "\n", + "QUESTIONS_CSV = r'C:\\Users\\User\\Desktop\\Folder_vs_documents\\questions.csv' # ← путь к CSV с вопросами\n", + "QUESTION_COL = 'question' # ← название колонки с вопросами\n", + "\n", + "# ── Подключение ──────────────────────────────────────────\n", + "# ── Подключение к ClickHouse ──────────────────────────────\n", + "client = clickhouse_connect.get_client(\n", + " host='ug1o26imbr.eu-central-1.aws.clickhouse.cloud',\n", + " 
username='default',\n", + " password='~MlK_g7KdbqYH',\n", + " secure=True\n", + ")\n", + "print(\"✅ ClickHouse подключён!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee60aa34", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Все библиотеки загружены!\n" + ] + } + ], + "source": [ + "import ollama\n", + "import clickhouse_connect\n", + "import pandas as pd\n", + "import json, time, re\n", + "from pypdf import PdfReader\n", + "from IPython.display import display, HTML, clear_output\n", + "import ipywidgets as widgets\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "7df65af0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Таблица rag_chunks готова\n" + ] + } + ], + "source": [ + "client.command(\"\"\"\n", + "CREATE TABLE IF NOT EXISTS default.rag_chunks (\n", + " id UInt64,\n", + " source String,\n", + " page UInt32,\n", + " chunk String,\n", + " embedding Array(Float32)\n", + ") ENGINE = MergeTree()\n", + "ORDER BY id\n", + "\"\"\")\n", + "\n", + "print(\"✅ Таблица rag_chunks готова\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "ea4f0913", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Функции готовы\n" + ] + } + ], + "source": [ + "def extract_pdf(pdf_path):\n", + " \"\"\"Извлечь текст по страницам из PDF\"\"\"\n", + " reader = PdfReader(pdf_path)\n", + " pages = []\n", + " for i, page in enumerate(reader.pages):\n", + " text = page.extract_text()\n", + " if text and len(text.strip()) > 30:\n", + " pages.append((i + 1, text.strip()))\n", + " return pages\n", + "\n", + "def split_chunks(text, size=100, overlap=10):\n", + " \"\"\"Разбить текст на перекрывающиеся чанки\"\"\"\n", + " words = text.split()\n", + " chunks = []\n", + " step = size - overlap\n", + " for i in range(0, len(words), step):\n", + " chunk = ' 
'.join(words[i:i + size])\n", + " if len(chunk) > 50:\n", + " chunks.append(chunk)\n", + " return chunks\n", + "\n", + "def get_embedding(text):\n", + " \"\"\"Получить вектор через Ollama\"\"\"\n", + " resp = ollama.embeddings(model=EMBED_MODEL, prompt=text)\n", + " return list(resp['embedding'])\n", + "\n", + "print(\"✅ Функции готовы\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "be8a3dc3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🗑️ Таблица очищена\n" + ] + } + ], + "source": [ + "client.command(\"TRUNCATE TABLE default.rag_chunks\")\n", + "print(\"🗑️ Таблица очищена\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "2ca29917", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📄 Обрабатываю: Integration\n", + " Страниц: 1154\n", + " ✅ Загружено чанков: 2833\n", + "\n", + "📄 Обрабатываю: Manager\n", + " Страниц: 564\n", + " ✅ Загружено чанков: 1350\n", + "\n", + "📄 Обрабатываю: Merchant\n", + " Страниц: 109\n", + " ✅ Загружено чанков: 205\n", + "\n", + "🎉 Итого загружено: 4388 чанков\n" + ] + } + ], + "source": [ + "chunk_id = 0\n", + "total_chunks = 0\n", + "\n", + "for pdf_path, source_name in PDF_FILES:\n", + " print(f\"\\n📄 Обрабатываю: {source_name}\")\n", + " pages = extract_pdf(pdf_path)\n", + " print(f\" Страниц: {len(pages)}\")\n", + "\n", + " rows = []\n", + " for page_num, text in pages:\n", + " chunks = split_chunks(text)\n", + " for chunk in chunks:\n", + " emb = get_embedding(chunk)\n", + " rows.append([chunk_id, source_name, page_num, chunk, emb])\n", + " chunk_id += 1\n", + "\n", + " client.insert(\n", + " 'default.rag_chunks',\n", + " rows,\n", + " column_names=['id', 'source', 'page', 'chunk', 'embedding']\n", + " )\n", + " total_chunks += len(rows)\n", + " print(f\" ✅ Загружено чанков: {len(rows)}\")\n", + "\n", + "print(f\"\\n🎉 Итого загружено: {total_chunks} чанков\")" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 29, + "id": "a3b15e25", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⏱️ Время: 24.57s\n", + "🤖 Информация не найдена в документах.\n" + ] + } + ], + "source": [ + "def search_context(question, top_k=5):\n", + " \"\"\"Найти похожие чанки через cosineDistance\"\"\"\n", + " q_emb = get_embedding(question)\n", + " q_str = '[' + ','.join(map(str, q_emb)) + ']'\n", + "\n", + " res = client.query(f\"\"\"\n", + " SELECT chunk, source, page,\n", + " cosineDistance(embedding, {q_str}) AS dist\n", + " FROM default.rag_chunks\n", + " ORDER BY dist ASC\n", + " LIMIT {top_k}\n", + " \"\"\")\n", + " return res.result_rows\n", + "\n", + "def ask_rag(question, top_k=5):\n", + " \"\"\"Полный RAG пайплайн: вопрос → контекст → ответ\"\"\"\n", + " t_start = time.time()\n", + "\n", + " # 1. Найти контекст\n", + " ctx_rows = search_context(question, top_k)\n", + " context = \"\\n\\n\".join([\n", + " f\"[{r[1]}, стр.{r[2]}] {r[0]}\"\n", + " for r in ctx_rows\n", + " ])\n", + " t_retrieve = time.time() - t_start\n", + "\n", + " # 2. Спросить Llama\n", + " messages = [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": (\n", + " \"Ты — помощник по технической документации. \"\n", + " \"Отвечай ТОЛЬКО на основе предоставленного контекста. \"\n", + " \"Если ответ не найден — скажи 'Информация не найдена в документах'. 
\"\n", + " \"Отвечай кратко и по делу.\"\n", + " )\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": f\"Контекст:\\n{context}\\n\\nВопрос: {question}\"\n", + " }\n", + " ]\n", + " resp = ollama.chat(model=LLM_MODEL, messages=messages)\n", + " t_total = time.time() - t_start\n", + "\n", + " return {\n", + " 'question' : question,\n", + " 'answer' : resp.message.content,\n", + " 'sources' : [(r[1], r[2]) for r in ctx_rows],\n", + " 'time_retrieve': round(t_retrieve, 2),\n", + " 'time_total' : round(t_total, 2),\n", + " }\n", + "\n", + "# Быстрый тест:\n", + "test = ask_rag(\"Тестовый вопрос?\")\n", + "print(f\"⏱️ Время: {test['time_total']}s\\n🤖 {test['answer'][:200]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "b98b3ba8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📋 Вопросов: 59\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5ff9ce52f8c7414baa9fcc62c9c9ac32", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(IntProgress(value=0, bar_style='info', description='Прогресс:', layout=Layout(width='70%'), max…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📊 Результаты:\n", + " Вопросов обработано : 59/59\n", + " Среднее время ответа: 28.4 сек\n", + " Сохранено в : benchmark_results.csv\n" + ] + } + ], + "source": [ + "df_q = pd.read_csv(QUESTIONS_CSV)\n", + "questions = df_q[QUESTION_COL].dropna().tolist()\n", + "print(f\"📋 Вопросов: {len(questions)}\")\n", + "\n", + "# Прогресс-бар\n", + "progress = widgets.IntProgress(\n", + " value=0, min=0, max=len(questions),\n", + " description='Прогресс:',\n", + " bar_style='info',\n", + " layout=widgets.Layout(width='70%')\n", + ")\n", + "status_lbl = widgets.Label(value='Ожидание...')\n", + "display(widgets.VBox([progress, status_lbl]))\n", + "\n", + "# Запуск 
бенчмарка\n", + "results = []\n", + "for i, q in enumerate(questions):\n", + " status_lbl.value = f\"[{i+1}/{len(questions)}] {q[:70]}...\"\n", + " try:\n", + " res = ask_rag(q)\n", + " results.append({\n", + " '№' : i + 1,\n", + " 'question' : q,\n", + " 'answer' : res['answer'],\n", + " 'sources' : str(res['sources']),\n", + " 'time_retrieve': res['time_retrieve'],\n", + " 'time_total' : res['time_total'],\n", + " 'status' : 'ok'\n", + " })\n", + " except Exception as e:\n", + " results.append({\n", + " '№': i+1, 'question': q, 'answer': f'ОШИБКА: {e}',\n", + " 'sources':'', 'time_retrieve':0, 'time_total':0, 'status':'error'\n", + " })\n", + " progress.value = i + 1\n", + "\n", + "status_lbl.value = '✅ Готово!'\n", + "\n", + "# Сохранить результаты\n", + "df_res = pd.DataFrame(results)\n", + "df_res.to_csv('benchmark_results.csv', index=False, encoding='utf-8-sig')\n", + "\n", + "avg_time = df_res['time_total'].mean()\n", + "ok_count = (df_res['status'] == 'ok').sum()\n", + "print(f\"\\n📊 Результаты:\")\n", + "print(f\" Вопросов обработано : {ok_count}/{len(questions)}\")\n", + "print(f\" Среднее время ответа: {avg_time:.1f} сек\")\n", + "print(f\" Сохранено в : benchmark_results.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "770088f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏆 Запуск LLM-судьи...\n", + " [1] Оценка: ⭐⭐ (2/5)\n", + " [2] Оценка: ⭐⭐ (2/5)\n", + " [3] Оценка: ⭐⭐ (2/5)\n", + " [4] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [5] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [6] Оценка: ⭐⭐⭐ (3/5)\n", + " [7] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [8] Оценка: ⭐ (1/5)\n", + " [9] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [10] Оценка: ⭐⭐ (2/5)\n", + " [11] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [12] Оценка: ⭐ (1/5)\n", + " [13] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [14] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [15] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [16] Оценка: ⭐⭐⭐ (3/5)\n", + " [17] Оценка: ⭐⭐⭐ (3/5)\n", + " [18] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [19] Оценка: ⭐⭐ (2/5)\n", + " 
[20] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [21] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [22] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [23] Оценка: ⭐⭐ (2/5)\n", + " [24] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [25] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [26] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [27] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [28] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [29] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [30] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [31] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [32] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [33] Оценка: ⭐⭐ (2/5)\n", + " [34] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [35] Оценка: ⭐⭐⭐ (3/5)\n", + " [36] Оценка: ⭐⭐⭐ (3/5)\n", + " [37] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [38] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [39] Оценка: ⭐⭐ (2/5)\n", + " [40] Оценка: ⭐ (1/5)\n", + " [41] Оценка: ⭐⭐⭐ (3/5)\n", + " [42] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [43] Оценка: ⭐ (1/5)\n", + " [44] Оценка: ⭐⭐⭐ (3/5)\n", + " [45] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [46] Оценка: ⭐⭐⭐ (3/5)\n", + " [47] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [48] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [49] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [50] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [51] Оценка: ⭐⭐ (2/5)\n", + " [52] Оценка: ⭐⭐ (2/5)\n", + " [53] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [54] Оценка: ⭐ (1/5)\n", + " [55] Оценка: ⭐⭐ (2/5)\n", + " [56] Оценка: ⭐⭐ (2/5)\n", + " [57] Оценка: ⭐⭐⭐⭐ (4/5)\n", + " [58] Оценка: ⭐ (1/5)\n", + " [59] Оценка: ⭐⭐ (2/5)\n", + "\n", + "✅ Средняя оценка: 3.12 / 5\n" + ] + } + ], + "source": [ + "def judge_answer(question, answer):\n", + " \"\"\"Llama оценивает качество своего же ответа от 1 до 5\"\"\"\n", + " prompt = f\"\"\"Оцени качество ответа на вопрос по шкале от 1 до 5.\n", + "\n", + "Вопрос: {question}\n", + "Ответ: {answer}\n", + "\n", + "Критерии оценки:\n", + "5 — полный, точный, понятный ответ\n", + "4 — хороший ответ с незначительными пробелами\n", + "3 — частично правильный ответ\n", + "2 — слабый ответ, мало полезной информации\n", + "1 — нет ответа или 'информация не найдена'\n", + "\n", + "Ответь ТОЛЬКО одной цифрой от 1 до 5.\"\"\"\n", + "\n", + " resp = ollama.chat(\n", + " model=LLM_MODEL,\n", + " messages=[{\"role\": \"user\", \"content\": 
prompt}]\n", + " )\n", + " text = resp.message.content.strip()\n", + " match = re.search(r'[1-5]', text)\n", + " return int(match.group()) if match else 0\n", + "\n", + "# Оценить все ответы\n", + "print(\"🏆 Запуск LLM-судьи...\")\n", + "scores = []\n", + "for i, row in df_res.iterrows():\n", + " score = judge_answer(row['question'], row['answer'])\n", + " scores.append(score)\n", + " print(f\" [{i+1}] Оценка: {'⭐'*score} ({score}/5)\")\n", + "\n", + "df_res['score'] = scores\n", + "df_res.to_csv('benchmark_results.csv', index=False, encoding='utf-8-sig')\n", + "print(f\"\\n✅ Средняя оценка: {sum(scores)/len(scores):.2f} / 5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "1bcfd0ee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
| # | \n", + "Вопрос | \n", + "Ответ | \n", + "Оценка | \n", + "Время | \n", + "
|---|---|---|---|---|
| 1 | \n", + "Which tables, materialized views, and queries do I need to fully cover the analytics pipeline for pa | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "25.5s | \n", + "
| 2 | \n", + "Provide an integration plan for loading payment and transaction data into ClickHouse | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "24.86s | \n", + "
| 3 | \n", + "Create a step-by-step guide: how to ingest data from application logs or APIs into ClickHouse | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "26.11s | \n", + "
| 4 | \n", + "How to test the integration (schema validation, data quality, idempotency, replay)? | \n", + "Для проверки интеграции следует выполнить тестовую транзакцию из Виртуального Терминала и убедиться, что все параметры подключены корректно.... | \n", + "⭐⭐⭐⭐ | \n", + "32.02s | \n", + "
| 5 | \n", + "What is the difference between a table and a materialized view in ClickHouse? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "25.65s | \n", + "
| 6 | \n", + "Should I use separate tables per currency or one fact table with a currency dimension? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐ | \n", + "25.54s | \n", + "
| 7 | \n", + "How do I check ingestion status and the last successful batch? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "25.14s | \n", + "
| 8 | \n", + "Which ingestion or ETL job statuses should we model and store? | \n", + "Информация не найдена в документах.... | \n", + "⭐ | \n", + "26.58s | \n", + "
| 9 | \n", + "What is a Connecting Party in our data model (source owner vs analytics consumer)? | \n", + "Connecting Party в нашем модели представляет Merchant и ID назначенный внешним процессором, если транзакция обрабатывалась им.... | \n", + "⭐⭐⭐⭐ | \n", + "29.66s | \n", + "
| 10 | \n", + "What is a Payment Gateway as a dimension or entity in the warehouse? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "27.08s | \n", + "
| 11 | \n", + "What is a table in ClickHouse, and when is it fact vs dimension? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "27.46s | \n", + "
| 12 | \n", + "What is a materialized view and when should I prefer it over a plain table? | \n", + "Информация не найдена в документах.... | \n", + "⭐ | \n", + "25.32s | \n", + "
| 13 | \n", + "What is a merchant control key, and may it appear in stored rows or only in secrets? | \n", + "Merchant Control Key - это ключ, который присваивается аккаунту Connecting Party в системе Doc2.0 Gateway. Он не хранится в таблицах, а используется к... | \n", + "⭐⭐⭐⭐ | \n", + "37.94s | \n", + "
| 14 | \n", + "How to sign HTTP requests to ClickHouse HTTP API or to external loaders using OAuth RSA-SHA256? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "26.89s | \n", + "
| 15 | \n", + "How to generate and securely store a private key for signing or client TLS? | \n", + "Use openssl commands:\n", + "\n", + "- Generate RSA keys:\n", + " ```\n", + "openssl genpkey -algorithm RSA -out private_key_pkcs_8.pem -pkeyopt rsa_keygen_bits:4096\n", + "```\n", + " ```\n", + "o... | \n", + "⭐⭐⭐⭐ | \n", + "40.07s | \n", + "
| 16 | \n", + "Do I need a private key for bulk transfer API (v4/transfer) or only for certain operations? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐ | \n", + "25.73s | \n", + "
| 17 | \n", + "Which APIs or clients require a private key or TLS client certificate? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐ | \n", + "26.38s | \n", + "
| 18 | \n", + "What is the difference between v2/sale and v4/sale event streams for schema and ingestion? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "25.27s | \n", + "
| 19 | \n", + "What is the difference between raw sale events and sale-form funnel events in the warehouse? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "26.13s | \n", + "
| 20 | \n", + "When will I receive a webhook or callback after an ingestion batch completes? | \n", + "После того, как исходная транзакция получит конечный статус.... | \n", + "⭐⭐⭐⭐ | \n", + "30.51s | \n", + "
| 21 | \n", + "How to validate callback or webhook origin (signature, allowlist, TLS)? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "29.4s | \n", + "
| 22 | \n", + "Should I implement both status polling and callback handling for pipelines? | \n", + "Нет, поскольку в документах указано, что для получения окончательного статуса нужно реализовать только одну из двух возможностей: статусы можно получи... | \n", + "⭐⭐⭐⭐ | \n", + "38.19s | \n", + "
| 23 | \n", + "How to build an operator-facing “finish” or success view after a load completes? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "30.47s | \n", + "
| 24 | \n", + "How to parameterize dashboards or SQL safely without exposing raw secrets? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "29.53s | \n", + "
| 25 | \n", + "How to compute a control checksum for v2/sale payloads before insert into ClickHouse? | \n", + "control = SHA-1( ENDPOINTID | client_orderid | amount | email | merchant_control ) \n", + "\n", + "где ENDPOINTID — идентификатор эндпоинта, client_orderid — клиент... | \n", + "⭐⭐⭐⭐ | \n", + "47.13s | \n", + "
| 26 | \n", + "How to compute a control checksum for v4/transfer payloads before insert into ClickHouse? | \n", + "Для вычисления контрольной суммы для в4/transfer пакетов необходимо:\n", + "\n", + "1. Преобразовать параметры запроса к шестнадцатеричному представлению (как указа... | \n", + "⭐⭐⭐⭐ | \n", + "52.15s | \n", + "
| 27 | \n", + "What staging table or intermediate layer do we use for 3DS upload-method-url results? | \n", + "В документах не указано название конкретной таблицы или интермедиарного слоя.... | \n", + "⭐⭐⭐⭐ | \n", + "25.02s | \n", + "
| 28 | \n", + "Do I need a 3DS upload path in the pipeline for v2/sale? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "23.17s | \n", + "
| 29 | \n", + "Do I need a 3DS upload path in the pipeline for v2/sale-form? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "26.19s | \n", + "
| 30 | \n", + "Do I need a 3DS upload path in the pipeline for v2/return? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "25.22s | \n", + "
| 31 | \n", + "How to model reversal events (compensating rows or status flags)? | \n", + "Compensating rows или status flag Type: Enum Default: Yes.... | \n", + "⭐⭐⭐⭐ | \n", + "28.31s | \n", + "
| 32 | \n", + "How to model refund events and link them to the original transaction? | \n", + "Моделировать возвраты можно, используя статус «reversal», который указывает на полную или частичную отмену предыдущего одобренного транзакции. Этот ст... | \n", + "⭐⭐⭐⭐ | \n", + "43.28s | \n", + "
| 33 | \n", + "How to ingest Google Pay events into ClickHouse? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "25.9s | \n", + "
| 34 | \n", + "How to ingest Apple Pay events into ClickHouse? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "26.87s | \n", + "
| 35 | \n", + "Where are test scenarios or golden datasets documented? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐ | \n", + "26.73s | \n", + "
| 36 | \n", + "Where is the Postman collection or equivalent HTTP examples for our loaders? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐ | \n", + "25.94s | \n", + "
| 37 | \n", + "Where is the schema for ingestion status responses (system tables, metadata tables)? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "27.42s | \n", + "
| 38 | \n", + "What insert or query throughput per node should we plan for in ClickHouse? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "26.87s | \n", + "
| 39 | \n", + "Which tenants or merchants are represented in the warehouse? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "26.03s | \n", + "
| 40 | \n", + "Which currencies exist in the dimension table? | \n", + "Информация не найдена в документах.... | \n", + "⭐ | \n", + "26.2s | \n", + "
| 41 | \n", + "Which payment methods exist in the dimension table? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐ | \n", + "26.03s | \n", + "
| 42 | \n", + "Which transaction types are modeled in the fact table? | \n", + "sale, chargeback and amount of funds held.... | \n", + "⭐⭐⭐⭐ | \n", + "26.23s | \n", + "
| 43 | \n", + "How to ingest bank transfer events into ClickHouse? | \n", + "Информация не найдена в документах.... | \n", + "⭐ | \n", + "26.91s | \n", + "
| 44 | \n", + "How to export full transaction history (SELECT … FORMAT, object storage, backups)? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐ | \n", + "26.99s | \n", + "
| 45 | \n", + "What is the INSERT or JSONEachRow schema for raw v2/sale events? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "27.49s | \n", + "
| 46 | \n", + "What is the SQL or report definition for the transaction report? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐ | \n", + "26.46s | \n", + "
| 47 | \n", + "Is the control parameter mandatory for v2/sale ingestion? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "25.93s | \n", + "
| 48 | \n", + "Is the control parameter mandatory for v4/transfer ingestion? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "24.35s | \n", + "
| 49 | \n", + "What is RPI in the transfer API, and which column or surrogate key replaces it in ClickHouse? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "24.76s | \n", + "
| 50 | \n", + "What is the difference between storing RPI vs card number in policy terms? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "25.7s | \n", + "
| 51 | \n", + "Is it safe to store RPI in ClickHouse under our retention and access model? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "26.03s | \n", + "
| 52 | \n", + "Is it safe to store card number in ClickHouse at all? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "24.74s | \n", + "
| 53 | \n", + "Do we need PCI controls for storing raw v2/sale payloads? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐⭐⭐ | \n", + "26.04s | \n", + "
| 54 | \n", + "Do we need PCI controls for sale-form funnel events? | \n", + "Информация не найдена в документах.... | \n", + "⭐ | \n", + "26.71s | \n", + "
| 55 | \n", + "Do we need PCI DSS certification for integrating analytics with Payneteasy data? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "30.22s | \n", + "
| 56 | \n", + "I received a merchant control key; how do I use it with ClickHouse (secrets manager, not plain table | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "25.87s | \n", + "
| 57 | \n", + "What is your model and version? | \n", + "Похоже, что вопрос относятся к версии Doc2.0 Manager Manual. \n", + "\n", + "Модель: Doc2.0\n", + "Версия: не указана (в данном контексте).... | \n", + "⭐⭐⭐⭐ | \n", + "28.74s | \n", + "
| 58 | \n", + "When were you launched? | \n", + "Информация не найдена в документах.... | \n", + "⭐ | \n", + "25.64s | \n", + "
| 59 | \n", + "Do you collect my messages? | \n", + "Информация не найдена в документах.... | \n", + "⭐⭐ | \n", + "29.01s | \n", + "
\n", + " 💾 Полные результаты сохранены в benchmark_results.csv\n", + "
\n", + "| # | \n", + "Вопрос | \n", + "Ответ | \n", + "Оценка | \n", + "Время | \n", + "
|---|
\n", + " 💾 Полные результаты сохранены в benchmark_results.csv\n", + "
\n", + "