From 0273b2d74306842518f131faaa9bfa01fe2b5586 Mon Sep 17 00:00:00 2001 From: NicolasLMP Date: Mon, 28 Jul 2025 12:02:52 +0200 Subject: [PATCH 1/2] feat : delete useless test --- .gitignore | 1 - flux_donnees.mmd | 166 +++++++++++++++++++++++------------ src/rag/semantic_pipeline.py | 63 ------------- src/vectordb/interface.py | 2 +- src/webapp/app.py | 2 +- 5 files changed, 114 insertions(+), 120 deletions(-) diff --git a/.gitignore b/.gitignore index c9aca18..997034a 100644 --- a/.gitignore +++ b/.gitignore @@ -67,7 +67,6 @@ rag_evaluation/config/test_datasets/ # Project specific files *.pkl -*.faiss *.pdf *.docx *.xlsx diff --git a/flux_donnees.mmd b/flux_donnees.mmd index a05d660..bed8096 100644 --- a/flux_donnees.mmd +++ b/flux_donnees.mmd @@ -1,70 +1,128 @@ graph TD %% Users and Interface - User([User]) -->|Connects| Login[Login Page] - Login -->|Verifies credentials| AuthDB[(SQLite Database
Users)] - Login -->|Authentication successful| Chat[Chat Interface] - Login -->|Admin access| Admin[User Management] - - %% Question Flow - User -->|Asks a question| Chat - Chat -->|Sends query| Retriever[FAISS Retriever] - - %% Document Search - Retriever -->|Vector similarity search| VectorDB[(FAISS Vector
Database)] - VectorDB -->|Relevant documents| Retriever - - %% Response Generation - Retriever -->|Document context| Chain[LangChain Chain] - Chain -->|Builds prompt| LLM[LLM Model
GPT 4o] - LLM -->|Generates response in French| Chain - Chain -->|Formatted response| Chat - Chat -->|Displays response + sources| User - - %% Database Initialization and Update - ConfluenceAPI[Confluence API] -->|Retrieves pages| DataLoader[DataLoader] - DataLoader -->|Processes documents| TextSplitter[Text Splitter] - TextSplitter -->|Splits into chunks| Embedder[Embeddings
HuggingFace] - Embedder -->|Vectors + Metadata| VectorDB - - %% User Management - Admin -->|Add/Remove| AuthDB - - %% Advanced Options - RebuildDB[Option: Rebuild DB] -.->|Triggers| DataLoader + User([User]) -->|Accesses| WebApp[Streamlit Web App] + User -->|CLI Commands| CLI[CLI Interface] + + %% Authentication Flow + WebApp -->|Authentication| AzureAuth[Azure AD Authentication] + AzureAuth -->|Valid credentials| ChatInterface[Chat Interface] + AzureAuth -->|Admin access| AdminInterface[Admin Interface] + + %% Main Chat Flow + User -->|Asks question| ChatInterface + ChatInterface -->|Query| SemanticPipeline[Semantic RAG Pipeline] + + %% Semantic Processing Pipeline + SemanticPipeline -->|Analyze query| QueryProcessor[Query Processor] + QueryProcessor -->|Intent & expansion| SemanticRetrieval[Semantic Retrieval Tool] + SemanticRetrieval -->|Vector search| WeaviateDB[(Weaviate Vector DB
Collection: isschat_docs)] + WeaviateDB -->|Relevant documents| SemanticRetrieval + SemanticRetrieval -->|Ranked results| GenerationTool[Generation Tool] + GenerationTool -->|Context + prompt| LLM[Gemini 2.5 Flash Lite] + LLM -->|Generated response| GenerationTool + GenerationTool -->|Final answer + sources| ChatInterface + + %% Data Storage & Management + GenerationTool -->|Save conversation| DataManager[Data Manager] + DataManager -->|Store data| StorageSystem{Storage System} + StorageSystem -->|Local| LocalStorage[(Local File Storage)] + StorageSystem -->|Cloud| AzureStorage[(Azure Blob Storage)] + + %% Features & History + ChatInterface -->|User feedback| FeaturesManager[Features Manager] + ChatInterface -->|Conversation history| HistoryManager[History Manager] + FeaturesManager -->|Feedback data| DataManager + HistoryManager -->|Load/save history| DataManager + + %% CLI Operations + CLI -->|Ingest command| IngestionPipeline[Confluence Ingestion Pipeline] + CLI -->|Status command| StatusCheck[System Status Check] + CLI -->|Query command| SemanticPipeline + CLI -->|Chat command| InteractiveCLI[Interactive CLI Chat] + + %% Data Ingestion Flow + IngestionPipeline -->|Extract| ConfluenceConnector[Confluence Connector] + ConfluenceConnector -->|Fetch pages| ConfluenceAPI[Confluence API] + ConfluenceConnector -->|Raw documents| DocumentProcessor[Document Processor] + DocumentProcessor -->|Clean & structure| DocumentChunker[Document Chunker] + DocumentChunker -->|Text chunks| EmbeddingService[Embedding Service
multilingual-e5-small] + EmbeddingService -->|Vector embeddings| WeaviateDB + + %% Configuration & Secrets + SemanticPipeline -.->|Config| ConfigManager[Configuration Manager] + ConfigManager -.->|Secrets| KeyVault[Azure Key Vault] + ConfigManager -.->|Settings| EnvFile[Environment Variables] + + %% Performance & Monitoring + AdminInterface -->|View metrics| PerformanceDashboard[Performance Dashboard] + PerformanceDashboard -->|Query stats| DataManager %% Subgraphs for organization - subgraph "User Interface" + subgraph "User Interface Layer" User - Login - Chat - Admin - RebuildDB + WebApp + CLI + ChatInterface + AdminInterface + InteractiveCLI + end + + subgraph "Authentication & Authorization" + AzureAuth + KeyVault end - subgraph "RAG Processing" - Retriever - Chain + subgraph "RAG Processing Engine" + SemanticPipeline + QueryProcessor + SemanticRetrieval + GenerationTool LLM end - subgraph "Data Storage" - VectorDB - AuthDB + subgraph "Data Storage Layer" + WeaviateDB + DataManager + StorageSystem + LocalStorage + AzureStorage end - subgraph "Data ingestion" + subgraph "Application Components" + FeaturesManager + HistoryManager + PerformanceDashboard + ConfigManager + end + + subgraph "Data Ingestion Pipeline" + IngestionPipeline + ConfluenceConnector ConfluenceAPI - DataLoader - TextSplitter - Embedder + DocumentProcessor + DocumentChunker + EmbeddingService + end + + subgraph "Configuration & Environment" + ConfigManager + KeyVault + EnvFile end - classDef interface fill:#f9f,stroke:#333,stroke-width:1px - classDef processing fill:#bbf,stroke:#333,stroke-width:1px - classDef storage fill:#bfb,stroke:#333,stroke-width:1px - classDef ingestion fill:#fbb,stroke:#333,stroke-width:1px + %% Styling + classDef interface fill:#e1f5fe,stroke:#0277bd,stroke-width:2px + classDef auth fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px + classDef processing fill:#e8f5e8,stroke:#2e7d32,stroke-width:2px + classDef storage fill:#fff3e0,stroke:#ef6c00,stroke-width:2px + classDef components fill:#fce4ec,stroke:#c2185b,stroke-width:2px + classDef ingestion fill:#f1f8e9,stroke:#558b2f,stroke-width:2px + classDef config fill:#f5f5f5,stroke:#616161,stroke-width:2px - class User,Login,Chat,Admin,RebuildDB interface - class Retriever,Chain,LLM processing - class VectorDB,AuthDB storage - class ConfluenceAPI,DataLoader,TextSplitter,Embedder ingestion \ No newline at end of file + class User,WebApp,CLI,ChatInterface,AdminInterface,InteractiveCLI interface + class AzureAuth,KeyVault auth + class SemanticPipeline,QueryProcessor,SemanticRetrieval,GenerationTool,LLM processing + class WeaviateDB,DataManager,StorageSystem,LocalStorage,AzureStorage storage + class FeaturesManager,HistoryManager,PerformanceDashboard components + class IngestionPipeline,ConfluenceConnector,ConfluenceAPI,DocumentProcessor,DocumentChunker,EmbeddingService ingestion + class ConfigManager,EnvFile config \ No newline at end of file diff --git a/src/rag/semantic_pipeline.py b/src/rag/semantic_pipeline.py index 3efd8ed..20f47aa 100644 --- a/src/rag/semantic_pipeline.py +++ b/src/rag/semantic_pipeline.py @@ -239,51 +239,6 @@ def compare_with_basic_retrieval(self, query: str, k: int = 5) -> Dict[str, Any] except Exception as e: return {"error": str(e), "query": query} - def test_problematic_query(self, query: str = "qui sont les collaborateurs sur Isschat") -> Dict[str, Any]: - """ - Test the pipeline with the specific problematic query about collaborators. - - Args: - query: The problematic query to test - - Returns: - Detailed test results - """ - try: - # Test with full semantic pipeline - start_time = time.time() - answer, sources = self.process_query(query, verbose=True) - response_time = (time.time() - start_time) * 1000 - - # Get comparison data - comparison = self.compare_with_basic_retrieval(query) - - # Analyze if the answer contains team information - team_keywords = ["vincent", "nicolas", "emin", "fraillon", "lambropoulos", "calyaka", "équipe", "team"] - answer_lower = answer.lower() - team_mentions = [keyword for keyword in team_keywords if keyword in answer_lower] - - return { - "test_query": query, - "semantic_pipeline_result": { - "answer": answer, - "sources": sources, - "response_time_ms": response_time, - "team_keywords_found": team_mentions, - "contains_team_info": len(team_mentions) > 2, - }, - "comparison": comparison, - "success_criteria": { - "finds_team_info": len(team_mentions) > 2, - "mentions_specific_names": any(name in answer_lower for name in ["vincent", "nicolas", "emin"]), - "better_than_basic": comparison.get("improvement_metrics", {}).get("semantic_advantage", False), - }, - "pipeline_status": self.get_status(), - } - - except Exception as e: - return {"error": str(e), "test_query": query} - def _format_sources_for_storage(self, formatted_docs) -> list[dict]: """Format sources for storage with enhanced metadata""" sources = [] @@ -351,24 +306,6 @@ def get_status(self) -> Dict[str, Any]: except Exception as e: return {"pipeline_type": "semantic_rag_pipeline", "ready": False, "error": str(e)} - def check_pipeline(self, test_query: str = "qui sont les collaborateurs sur Isschat") -> Dict[str, Any]: - """Check pipeline with default problematic query""" - try: - if not self.is_ready(): - return {"success": False, "error": "Pipeline not ready", "details": self.get_status()} - - # Run the problematic query test - test_result = self.test_problematic_query(test_query) - - return { - "success": test_result.get("success_criteria", {}).get("finds_team_info", False), - "test_result": test_result, - "pipeline_status": self.get_status(), - } - - except Exception as e: - return {"success": False, "error": str(e), "test_query": test_query} - class SemanticRAGPipelineFactory: """Factory for creating semantic RAG pipelines""" diff --git a/src/vectordb/interface.py b/src/vectordb/interface.py index f39d4cf..e481e3a 100644 --- a/src/vectordb/interface.py +++ b/src/vectordb/interface.py @@ -1,6 +1,6 @@ """ Vector database interface for clean abstractions. -Supports both Qdrant and FAISS implementations. +Supports Weaviate vector database implementation. """ from abc import ABC, abstractmethod diff --git a/src/webapp/app.py b/src/webapp/app.py index def2a4f..d25520c 100644 --- a/src/webapp/app.py +++ b/src/webapp/app.py @@ -113,7 +113,7 @@ def initialize_embedder(): def get_model(rebuild_db=False): # Display a spinner during loading with st.spinner("Loading RAG model..."): - # Check if the index.faiss file exists + # Initialize RAG model from src.config.settings import get_debug_info # Get debug info From 933dd909a8086be35f8bf5846fb9da4b84eedaa0 Mon Sep 17 00:00:00 2001 From: NicolasLMP Date: Mon, 28 Jul 2025 15:31:56 +0200 Subject: [PATCH 2/2] feat : deleting any occurences of FAISS --- pyproject.toml | 1 - uv.lock | 24 ------------------------ 2 files changed, 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c1d7778..a04a21d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ dependencies = [ "streamlit>=1.31.0", "langchain-core>=0.2.0", "langchain-text-splitters>=0.2.0", - "faiss-cpu>=1.7.4", "pandas>=2.0.0", "python-dotenv>=1.0.0", "huggingface-hub[hf-xet]>=0.19.0", diff --git a/uv.lock b/uv.lock index ea0ced3..53ac286 100644 --- a/uv.lock +++ b/uv.lock @@ -1073,28 +1073,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c3/5f/9dd2090e20cd39244a21ebb8b95589d6fa6ceecc587ed943290847d47df1/evaluate-0.4.4-py3-none-any.whl", hash = "sha256:e7e10d2617847a6127f023dd444ba241c7a1c8e3e081f72b15f35686e8220dbd", size = 84097, upload-time = "2025-06-20T17:48:18.004Z" }, ] -[[package]] -name = "faiss-cpu" -version = "1.11.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "packaging" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e7/9a/e33fc563f007924dd4ec3c5101fe5320298d6c13c158a24a9ed849058569/faiss_cpu-1.11.0.tar.gz", hash = "sha256:44877b896a2b30a61e35ea4970d008e8822545cb340eca4eff223ac7f40a1db9", size = 70218, upload-time = "2025-04-28T07:48:30.459Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/d3/7178fa07047fd770964a83543329bb5e3fc1447004cfd85186ccf65ec3ee/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:356437b9a46f98c25831cdae70ca484bd6c05065af6256d87f6505005e9135b9", size = 3313807, upload-time = "2025-04-28T07:47:54.533Z" }, - { url = "https://files.pythonhosted.org/packages/9e/71/25f5f7b70a9f22a3efe19e7288278da460b043a3b60ad98e4e47401ed5aa/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c4a3d35993e614847f3221c6931529c0bac637a00eff0d55293e1db5cb98c85f", size = 7913537, upload-time = "2025-04-28T07:47:56.723Z" }, - { url = "https://files.pythonhosted.org/packages/b0/c8/a5cb8466c981ad47750e1d5fda3d4223c82f9da947538749a582b3a2d35c/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8f9af33e0b8324e8199b93eb70ac4a951df02802a9dcff88e9afc183b11666f0", size = 3785180, upload-time = "2025-04-28T07:47:59.004Z" }, - { url = "https://files.pythonhosted.org/packages/7f/37/eaf15a7d80e1aad74f56cf737b31b4547a1a664ad3c6e4cfaf90e82454a8/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:48b7e7876829e6bdf7333041800fa3c1753bb0c47e07662e3ef55aca86981430", size = 31287630, upload-time = "2025-04-28T07:48:01.248Z" }, - { url = "https://files.pythonhosted.org/packages/ff/5c/902a78347e9c47baaf133e47863134e564c39f9afe105795b16ee986b0df/faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:bdc199311266d2be9d299da52361cad981393327b2b8aa55af31a1b75eaaf522", size = 15005398, upload-time = "2025-04-28T07:48:04.232Z" }, - { url = "https://files.pythonhosted.org/packages/92/90/d2329ce56423cc61f4c20ae6b4db001c6f88f28bf5a7ef7f8bbc246fd485/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:0c98e5feff83b87348e44eac4d578d6f201780dae6f27f08a11d55536a20b3a8", size = 3313807, upload-time = "2025-04-28T07:48:06.486Z" }, - { url = "https://files.pythonhosted.org/packages/24/14/8af8f996d54e6097a86e6048b1a2c958c52dc985eb4f935027615079939e/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:796e90389427b1c1fb06abdb0427bb343b6350f80112a2e6090ac8f176ff7416", size = 7913539, upload-time = "2025-04-28T07:48:08.338Z" }, - { url = "https://files.pythonhosted.org/packages/b2/2b/437c2f36c3aa3cffe041479fced1c76420d3e92e1f434f1da3be3e6f32b1/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2b6e355dda72b3050991bc32031b558b8f83a2b3537a2b9e905a84f28585b47e", size = 3785181, upload-time = "2025-04-28T07:48:10.594Z" }, - { url = "https://files.pythonhosted.org/packages/66/75/955527414371843f558234df66fa0b62c6e86e71e4022b1be9333ac6004c/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6c482d07194638c169b4422774366e7472877d09181ea86835e782e6304d4185", size = 31287635, upload-time = "2025-04-28T07:48:12.93Z" }, - { url = "https://files.pythonhosted.org/packages/50/51/35b7a3f47f7859363a367c344ae5d415ea9eda65db0a7d497c7ea2c0b576/faiss_cpu-1.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:13eac45299532b10e911bff1abbb19d1bf5211aa9e72afeade653c3f1e50e042", size = 15005455, upload-time = "2025-04-28T07:48:16.173Z" }, -] - [[package]] name = "filelock" version = "3.18.0" @@ -1577,7 +1555,6 @@ dependencies = [ { name = "beautifulsoup4" }, { name = "dotenv" }, { name = "einops" }, - { name = "faiss-cpu" }, { name = "huggingface-hub", extra = ["hf-xet"] }, { name = "langchain" }, { name = "langchain-community" }, @@ -1648,7 +1625,6 @@ requires-dist = [ { name = "dvc", extras = ["azure"], marker = "extra == 'evaluation'", specifier = ">=3.0.0" }, { name = "einops", specifier = ">=0.8.1" }, { name = "evaluate", marker = "extra == 'evaluation'", specifier = ">=0.4.0" }, - { name = "faiss-cpu", specifier = ">=1.7.4" }, { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.19.0" }, { name = "langchain", specifier = ">=0.2.0" }, { name = "langchain-community", specifier = ">=0.2.0" },