From 0273b2d74306842518f131faaa9bfa01fe2b5586 Mon Sep 17 00:00:00 2001
From: NicolasLMP <nicolas.lambropoulos54@gmail.com>
Date: Mon, 28 Jul 2025 12:02:52 +0200
Subject: [PATCH 1/2] feat : delete useless test

---
 .gitignore                   |   1 -
 flux_donnees.mmd             | 166 +++++++++++++++++++++++------------
 src/rag/semantic_pipeline.py |  63 -------------
 src/vectordb/interface.py    |   2 +-
 src/webapp/app.py            |   2 +-
 5 files changed, 114 insertions(+), 120 deletions(-)
diff --git a/.gitignore b/.gitignore
index c9aca18..997034a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,7 +67,6 @@ rag_evaluation/config/test_datasets/
 
 # Project specific files
 *.pkl
-*.faiss
 *.pdf
 *.docx
 *.xlsx
diff --git a/flux_donnees.mmd b/flux_donnees.mmd
index a05d660..bed8096 100644
--- a/flux_donnees.mmd
+++ b/flux_donnees.mmd
@@ -1,70 +1,128 @@
 graph TD
     %% Users and Interface
-    User([User]) -->|Connects| Login[Login Page]
-    Login -->|Verifies credentials| AuthDB[(SQLite Database<br>Users)]
-    Login -->|Authentication successful| Chat[Chat Interface]
-    Login -->|Admin access| Admin[User Management]
-    
-    %% Question Flow
-    User -->|Asks a question| Chat
-    Chat -->|Sends query| Retriever[FAISS Retriever]
-    
-    %% Document Search
-    Retriever -->|Vector similarity search| VectorDB[(FAISS Vector<br>Database)]
-    VectorDB -->|Relevant documents| Retriever
-    
-    %% Response Generation
-    Retriever -->|Document context| Chain[LangChain Chain]
-    Chain -->|Builds prompt| LLM[LLM Model<br>GPT 4o]
-    LLM -->|Generates response in French| Chain
-    Chain -->|Formatted response| Chat
-    Chat -->|Displays response + sources| User
-    
-    %% Database Initialization and Update
-    ConfluenceAPI[Confluence API] -->|Retrieves pages| DataLoader[DataLoader]
-    DataLoader -->|Processes documents| TextSplitter[Text Splitter]
-    TextSplitter -->|Splits into chunks| Embedder[Embeddings<br>HuggingFace]
-    Embedder -->|Vectors + Metadata| VectorDB
-    
-    %% User Management
-    Admin -->|Add/Remove| AuthDB
-    
-    %% Advanced Options
-    RebuildDB[Option: Rebuild DB] -.->|Triggers| DataLoader
+    User([User]) -->|Accesses| WebApp[Streamlit Web App]
+    User -->|CLI Commands| CLI[CLI Interface]
+    
+    %% Authentication Flow
+    WebApp -->|Authentication| AzureAuth[Azure AD Authentication]
+    AzureAuth -->|Valid credentials| ChatInterface[Chat Interface]
+    AzureAuth -->|Admin access| AdminInterface[Admin Interface]
+    
+    %% Main Chat Flow
+    User -->|Asks question| ChatInterface
+    ChatInterface -->|Query| SemanticPipeline[Semantic RAG Pipeline]
+    
+    %% Semantic Processing Pipeline
+    SemanticPipeline -->|Analyze query| QueryProcessor[Query Processor]
+    QueryProcessor -->|Intent & expansion| SemanticRetrieval[Semantic Retrieval Tool]
+    SemanticRetrieval -->|Vector search| WeaviateDB[(Weaviate Vector DB<br>Collection: isschat_docs)]
+    WeaviateDB -->|Relevant documents| SemanticRetrieval
+    SemanticRetrieval -->|Ranked results| GenerationTool[Generation Tool]
+    GenerationTool -->|Context + prompt| LLM[Gemini 2.5 Flash Lite]
+    LLM -->|Generated response| GenerationTool
+    GenerationTool -->|Final answer + sources| ChatInterface
+    
+    %% Data Storage & Management
+    GenerationTool -->|Save conversation| DataManager[Data Manager]
+    DataManager -->|Store data| StorageSystem{Storage System}
+    StorageSystem -->|Local| LocalStorage[(Local File Storage)]
+    StorageSystem -->|Cloud| AzureStorage[(Azure Blob Storage)]
+    
+    %% Features & History
+    ChatInterface -->|User feedback| FeaturesManager[Features Manager]
+    ChatInterface -->|Conversation history| HistoryManager[History Manager]
+    FeaturesManager -->|Feedback data| DataManager
+    HistoryManager -->|Load/save history| DataManager
+    
+    %% CLI Operations
+    CLI -->|Ingest command| IngestionPipeline[Confluence Ingestion Pipeline]
+    CLI -->|Status command| StatusCheck[System Status Check]
+    CLI -->|Query command| SemanticPipeline
+    CLI -->|Chat command| InteractiveCLI[Interactive CLI Chat]
+    
+    %% Data Ingestion Flow
+    IngestionPipeline -->|Extract| ConfluenceConnector[Confluence Connector]
+    ConfluenceConnector -->|Fetch pages| ConfluenceAPI[Confluence API]
+    ConfluenceConnector -->|Raw documents| DocumentProcessor[Document Processor]
+    DocumentProcessor -->|Clean & structure| DocumentChunker[Document Chunker]
+    DocumentChunker -->|Text chunks| EmbeddingService[Embedding Service<br>multilingual-e5-small]
+    EmbeddingService -->|Vector embeddings| WeaviateDB
+    
+    %% Configuration & Secrets
+    SemanticPipeline -.->|Config| ConfigManager[Configuration Manager]
+    ConfigManager -.->|Secrets| KeyVault[Azure Key Vault]
+    ConfigManager -.->|Settings| EnvFile[Environment Variables]
+    
+    %% Performance & Monitoring
+    AdminInterface -->|View metrics| PerformanceDashboard[Performance Dashboard]
+    PerformanceDashboard -->|Query stats| DataManager
     
     %% Subgraphs for organization
-    subgraph "User Interface"
+    subgraph "User Interface Layer"
         User
-        Login
-        Chat
-        Admin
-        RebuildDB
+        WebApp
+        CLI
+        ChatInterface
+        AdminInterface
+        InteractiveCLI
+    end
+    
+    subgraph "Authentication & Authorization"
+        AzureAuth
+        KeyVault
     end
     
-    subgraph "RAG Processing"
-        Retriever
-        Chain
+    subgraph "RAG Processing Engine"
+        SemanticPipeline
+        QueryProcessor
+        SemanticRetrieval
+        GenerationTool
         LLM
     end
     
-    subgraph "Data Storage"
-        VectorDB
-        AuthDB
+    subgraph "Data Storage Layer"
+        WeaviateDB
+        DataManager
+        StorageSystem
+        LocalStorage
+        AzureStorage
     end
     
-    subgraph "Data ingestion"
+    subgraph "Application Components"
+        FeaturesManager
+        HistoryManager
+        PerformanceDashboard
+        ConfigManager
+    end
+    
+    subgraph "Data Ingestion Pipeline"
+        IngestionPipeline
+        ConfluenceConnector
         ConfluenceAPI
-        DataLoader
-        TextSplitter
-        Embedder
+        DocumentProcessor
+        DocumentChunker
+        EmbeddingService
+    end
+    
+    subgraph "Configuration & Environment"
+        ConfigManager
+        KeyVault
+        EnvFile
     end
     
-    classDef interface fill:#f9f,stroke:#333,stroke-width:1px
-    classDef processing fill:#bbf,stroke:#333,stroke-width:1px
-    classDef storage fill:#bfb,stroke:#333,stroke-width:1px
-    classDef ingestion fill:#fbb,stroke:#333,stroke-width:1px
+    %% Styling
+    classDef interface fill:#e1f5fe,stroke:#0277bd,stroke-width:2px
+    classDef auth fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
+    classDef processing fill:#e8f5e8,stroke:#2e7d32,stroke-width:2px
+    classDef storage fill:#fff3e0,stroke:#ef6c00,stroke-width:2px
+    classDef components fill:#fce4ec,stroke:#c2185b,stroke-width:2px
+    classDef ingestion fill:#f1f8e9,stroke:#558b2f,stroke-width:2px
+    classDef config fill:#f5f5f5,stroke:#616161,stroke-width:2px
     
-    class User,Login,Chat,Admin,RebuildDB interface
-    class Retriever,Chain,LLM processing
-    class VectorDB,AuthDB storage
-    class ConfluenceAPI,DataLoader,TextSplitter,Embedder ingestion
\ No newline at end of file
+    class User,WebApp,CLI,ChatInterface,AdminInterface,InteractiveCLI interface
+    class AzureAuth,KeyVault auth
+    class SemanticPipeline,QueryProcessor,SemanticRetrieval,GenerationTool,LLM processing
+    class WeaviateDB,DataManager,StorageSystem,LocalStorage,AzureStorage storage
+    class FeaturesManager,HistoryManager,PerformanceDashboard components
+    class IngestionPipeline,ConfluenceConnector,ConfluenceAPI,DocumentProcessor,DocumentChunker,EmbeddingService ingestion
+    class ConfigManager,EnvFile config
\ No newline at end of file
diff --git a/src/rag/semantic_pipeline.py b/src/rag/semantic_pipeline.py
index 3efd8ed..20f47aa 100644
--- a/src/rag/semantic_pipeline.py
+++ b/src/rag/semantic_pipeline.py
@@ -239,51 +239,6 @@ def compare_with_basic_retrieval(self, query: str, k: int = 5) -> Dict[str, Any]
         except Exception as e:
             return {"error": str(e), "query": query}
 
-    def test_problematic_query(self, query: str = "qui sont les collaborateurs sur Isschat") -> Dict[str, Any]:
-        """
-        Test the pipeline with the specific problematic query about collaborators.
-
-        Args:
-            query: The problematic query to test
-
-        Returns:
-            Detailed test results
-        """
-        try:
-            # Test with full semantic pipeline
-            start_time = time.time()
-            answer, sources = self.process_query(query, verbose=True)
-            response_time = (time.time() - start_time) * 1000
-
-            # Get comparison data
-            comparison = self.compare_with_basic_retrieval(query)
-
-            # Analyze if the answer contains team information
-            team_keywords = ["vincent", "nicolas", "emin", "fraillon", "lambropoulos", "calyaka", "équipe", "team"]
-            answer_lower = answer.lower()
-            team_mentions = [keyword for keyword in team_keywords if keyword in answer_lower]
-
-            return {
-                "test_query": query,
-                "semantic_pipeline_result": {
-                    "answer": answer,
-                    "sources": sources,
-                    "response_time_ms": response_time,
-                    "team_keywords_found": team_mentions,
-                    "contains_team_info": len(team_mentions) > 2,
-                },
-                "comparison": comparison,
-                "success_criteria": {
-                    "finds_team_info": len(team_mentions) > 2,
-                    "mentions_specific_names": any(name in answer_lower for name in ["vincent", "nicolas", "emin"]),
-                    "better_than_basic": comparison.get("improvement_metrics", {}).get("semantic_advantage", False),
-                },
-                "pipeline_status": self.get_status(),
-            }
-
-        except Exception as e:
-            return {"error": str(e), "test_query": query}
-
     def _format_sources_for_storage(self, formatted_docs) -> list[dict]:
         """Format sources for storage with enhanced metadata"""
         sources = []
@@ -351,24 +306,6 @@ def get_status(self) -> Dict[str, Any]:
         except Exception as e:
             return {"pipeline_type": "semantic_rag_pipeline", "ready": False, "error": str(e)}
 
-    def check_pipeline(self, test_query: str = "qui sont les collaborateurs sur Isschat") -> Dict[str, Any]:
-        """Check pipeline with default problematic query"""
-        try:
-            if not self.is_ready():
-                return {"success": False, "error": "Pipeline not ready", "details": self.get_status()}
-
-            # Run the problematic query test
-            test_result = self.test_problematic_query(test_query)
-
-            return {
-                "success": test_result.get("success_criteria", {}).get("finds_team_info", False),
-                "test_result": test_result,
-                "pipeline_status": self.get_status(),
-            }
-
-        except Exception as e:
-            return {"success": False, "error": str(e), "test_query": test_query}
-
 
 class SemanticRAGPipelineFactory:
     """Factory for creating semantic RAG pipelines"""
diff --git a/src/vectordb/interface.py b/src/vectordb/interface.py
index f39d4cf..e481e3a 100644
--- a/src/vectordb/interface.py
+++ b/src/vectordb/interface.py
@@ -1,6 +1,6 @@
 """
 Vector database interface for clean abstractions.
-Supports both Qdrant and FAISS implementations.
+Supports Weaviate vector database implementation.
 """
 
 from abc import ABC, abstractmethod
diff --git a/src/webapp/app.py b/src/webapp/app.py
index def2a4f..d25520c 100644
--- a/src/webapp/app.py
+++ b/src/webapp/app.py
@@ -113,7 +113,7 @@ def initialize_embedder():
 def get_model(rebuild_db=False):
     # Display a spinner during loading
     with st.spinner("Loading RAG model..."):
-        # Check if the index.faiss file exists
+        # Initialize RAG model
         from src.config.settings import get_debug_info
 
         # Get debug info

From 933dd909a8086be35f8bf5846fb9da4b84eedaa0 Mon Sep 17 00:00:00 2001
From: NicolasLMP <nicolas.lambropoulos54@gmail.com>
Date: Mon, 28 Jul 2025 15:31:56 +0200
Subject: [PATCH 2/2] feat : deleting any occurrences of FAISS

---
 pyproject.toml |  1 -
 uv.lock        | 24 ------------------------
 2 files changed, 25 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c1d7778..a04a21d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,6 @@ dependencies = [
     "streamlit>=1.31.0",
     "langchain-core>=0.2.0",
     "langchain-text-splitters>=0.2.0",
-    "faiss-cpu>=1.7.4",
     "pandas>=2.0.0",
     "python-dotenv>=1.0.0",
     "huggingface-hub[hf-xet]>=0.19.0",
diff --git a/uv.lock b/uv.lock
index ea0ced3..53ac286 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1073,28 +1073,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c3/5f/9dd2090e20cd39244a21ebb8b95589d6fa6ceecc587ed943290847d47df1/evaluate-0.4.4-py3-none-any.whl", hash = "sha256:e7e10d2617847a6127f023dd444ba241c7a1c8e3e081f72b15f35686e8220dbd", size = 84097, upload-time = "2025-06-20T17:48:18.004Z" },
 ]
 
-[[package]]
-name = "faiss-cpu"
-version = "1.11.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "numpy" },
-    { name = "packaging" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/e7/9a/e33fc563f007924dd4ec3c5101fe5320298d6c13c158a24a9ed849058569/faiss_cpu-1.11.0.tar.gz", hash = "sha256:44877b896a2b30a61e35ea4970d008e8822545cb340eca4eff223ac7f40a1db9", size = 70218, upload-time = "2025-04-28T07:48:30.459Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/3b/d3/7178fa07047fd770964a83543329bb5e3fc1447004cfd85186ccf65ec3ee/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:356437b9a46f98c25831cdae70ca484bd6c05065af6256d87f6505005e9135b9", size = 3313807, upload-time = "2025-04-28T07:47:54.533Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/71/25f5f7b70a9f22a3efe19e7288278da460b043a3b60ad98e4e47401ed5aa/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c4a3d35993e614847f3221c6931529c0bac637a00eff0d55293e1db5cb98c85f", size = 7913537, upload-time = "2025-04-28T07:47:56.723Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/c8/a5cb8466c981ad47750e1d5fda3d4223c82f9da947538749a582b3a2d35c/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8f9af33e0b8324e8199b93eb70ac4a951df02802a9dcff88e9afc183b11666f0", size = 3785180, upload-time = "2025-04-28T07:47:59.004Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/37/eaf15a7d80e1aad74f56cf737b31b4547a1a664ad3c6e4cfaf90e82454a8/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:48b7e7876829e6bdf7333041800fa3c1753bb0c47e07662e3ef55aca86981430", size = 31287630, upload-time = "2025-04-28T07:48:01.248Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/5c/902a78347e9c47baaf133e47863134e564c39f9afe105795b16ee986b0df/faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:bdc199311266d2be9d299da52361cad981393327b2b8aa55af31a1b75eaaf522", size = 15005398, upload-time = "2025-04-28T07:48:04.232Z" },
-    { url = "https://files.pythonhosted.org/packages/92/90/d2329ce56423cc61f4c20ae6b4db001c6f88f28bf5a7ef7f8bbc246fd485/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:0c98e5feff83b87348e44eac4d578d6f201780dae6f27f08a11d55536a20b3a8", size = 3313807, upload-time = "2025-04-28T07:48:06.486Z" },
-    { url = "https://files.pythonhosted.org/packages/24/14/8af8f996d54e6097a86e6048b1a2c958c52dc985eb4f935027615079939e/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:796e90389427b1c1fb06abdb0427bb343b6350f80112a2e6090ac8f176ff7416", size = 7913539, upload-time = "2025-04-28T07:48:08.338Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/2b/437c2f36c3aa3cffe041479fced1c76420d3e92e1f434f1da3be3e6f32b1/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2b6e355dda72b3050991bc32031b558b8f83a2b3537a2b9e905a84f28585b47e", size = 3785181, upload-time = "2025-04-28T07:48:10.594Z" },
-    { url = "https://files.pythonhosted.org/packages/66/75/955527414371843f558234df66fa0b62c6e86e71e4022b1be9333ac6004c/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6c482d07194638c169b4422774366e7472877d09181ea86835e782e6304d4185", size = 31287635, upload-time = "2025-04-28T07:48:12.93Z" },
-    { url = "https://files.pythonhosted.org/packages/50/51/35b7a3f47f7859363a367c344ae5d415ea9eda65db0a7d497c7ea2c0b576/faiss_cpu-1.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:13eac45299532b10e911bff1abbb19d1bf5211aa9e72afeade653c3f1e50e042", size = 15005455, upload-time = "2025-04-28T07:48:16.173Z" },
-]
-
 [[package]]
 name = "filelock"
 version = "3.18.0"
@@ -1577,7 +1555,6 @@ dependencies = [
     { name = "beautifulsoup4" },
     { name = "dotenv" },
     { name = "einops" },
-    { name = "faiss-cpu" },
     { name = "huggingface-hub", extra = ["hf-xet"] },
     { name = "langchain" },
     { name = "langchain-community" },
@@ -1648,7 +1625,6 @@ requires-dist = [
     { name = "dvc", extras = ["azure"], marker = "extra == 'evaluation'", specifier = ">=3.0.0" },
     { name = "einops", specifier = ">=0.8.1" },
     { name = "evaluate", marker = "extra == 'evaluation'", specifier = ">=0.4.0" },
-    { name = "faiss-cpu", specifier = ">=1.7.4" },
     { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.19.0" },
     { name = "langchain", specifier = ">=0.2.0" },
     { name = "langchain-community", specifier = ">=0.2.0" },