Isskar · NicolasLMP · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025
diff --git a/.gitignore b/.gitignore
@@ -67,7 +67,6 @@ rag_evaluation/config/test_datasets/
 
 # Project specific files
 *.pkl
-*.faiss
 *.pdf
 *.docx
 *.xlsx

diff --git a/flux_donnees.mmd b/flux_donnees.mmd
@@ -1,70 +1,128 @@
 graph TD
     %% Users and Interface
-    User([User]) -->|Connects| Login[Login Page]
-    Login -->|Verifies credentials| AuthDB[(SQLite Database<br>Users)]
-    Login -->|Authentication successful| Chat[Chat Interface]
-    Login -->|Admin access| Admin[User Management]
-
-    %% Question Flow
-    User -->|Asks a question| Chat
-    Chat -->|Sends query| Retriever[FAISS Retriever]
-
-    %% Document Search
-    Retriever -->|Vector similarity search| VectorDB[(FAISS Vector<br>Database)]
-    VectorDB -->|Relevant documents| Retriever
-
-    %% Response Generation
-    Retriever -->|Document context| Chain[LangChain Chain]
-    Chain -->|Builds prompt| LLM[LLM Model<br>GPT 4o]
-    LLM -->|Generates response in French| Chain
-    Chain -->|Formatted response| Chat
-    Chat -->|Displays response + sources| User
-
-    %% Database Initialization and Update
-    ConfluenceAPI[Confluence API] -->|Retrieves pages| DataLoader[DataLoader]
-    DataLoader -->|Processes documents| TextSplitter[Text Splitter]
-    TextSplitter -->|Splits into chunks| Embedder[Embeddings<br>HuggingFace]
-    Embedder -->|Vectors + Metadata| VectorDB
-
-    %% User Management
-    Admin -->|Add/Remove| AuthDB
-
-    %% Advanced Options
-    RebuildDB[Option: Rebuild DB] -.->|Triggers| DataLoader
+    User([User]) -->|Accesses| WebApp[Streamlit Web App]
+    User -->|CLI Commands| CLI[CLI Interface]
+
+    %% Authentication Flow
+    WebApp -->|Authentication| AzureAuth[Azure AD Authentication]
+    AzureAuth -->|Valid credentials| ChatInterface[Chat Interface]
+    AzureAuth -->|Admin access| AdminInterface[Admin Interface]
+
+    %% Main Chat Flow
+    User -->|Asks question| ChatInterface
+    ChatInterface -->|Query| SemanticPipeline[Semantic RAG Pipeline]
+
+    %% Semantic Processing Pipeline
+    SemanticPipeline -->|Analyze query| QueryProcessor[Query Processor]
+    QueryProcessor -->|Intent & expansion| SemanticRetrieval[Semantic Retrieval Tool]
+    SemanticRetrieval -->|Vector search| WeaviateDB[(Weaviate Vector DB<br>Collection: isschat_docs)]
+    WeaviateDB -->|Relevant documents| SemanticRetrieval
+    SemanticRetrieval -->|Ranked results| GenerationTool[Generation Tool]
+    GenerationTool -->|Context + prompt| LLM[Gemini 2.5 Flash Lite]
+    LLM -->|Generated response| GenerationTool
+    GenerationTool -->|Final answer + sources| ChatInterface
+
+    %% Data Storage & Management
+    GenerationTool -->|Save conversation| DataManager[Data Manager]
+    DataManager -->|Store data| StorageSystem{Storage System}
+    StorageSystem -->|Local| LocalStorage[(Local File Storage)]
+    StorageSystem -->|Cloud| AzureStorage[(Azure Blob Storage)]
+
+    %% Features & History
+    ChatInterface -->|User feedback| FeaturesManager[Features Manager]
+    ChatInterface -->|Conversation history| HistoryManager[History Manager]
+    FeaturesManager -->|Feedback data| DataManager
+    HistoryManager -->|Load/save history| DataManager
+
+    %% CLI Operations
+    CLI -->|Ingest command| IngestionPipeline[Confluence Ingestion Pipeline]
+    CLI -->|Status command| StatusCheck[System Status Check]
+    CLI -->|Query command| SemanticPipeline
+    CLI -->|Chat command| InteractiveCLI[Interactive CLI Chat]
+
+    %% Data Ingestion Flow
+    IngestionPipeline -->|Extract| ConfluenceConnector[Confluence Connector]
+    ConfluenceConnector -->|Fetch pages| ConfluenceAPI[Confluence API]
+    ConfluenceConnector -->|Raw documents| DocumentProcessor[Document Processor]
+    DocumentProcessor -->|Clean & structure| DocumentChunker[Document Chunker]
+    DocumentChunker -->|Text chunks| EmbeddingService[Embedding Service<br>multilingual-e5-small]
+    EmbeddingService -->|Vector embeddings| WeaviateDB
+
+    %% Configuration & Secrets
+    SemanticPipeline -.->|Config| ConfigManager[Configuration Manager]
+    ConfigManager -.->|Secrets| KeyVault[Azure Key Vault]
+    ConfigManager -.->|Settings| EnvFile[Environment Variables]
+
+    %% Performance & Monitoring
+    AdminInterface -->|View metrics| PerformanceDashboard[Performance Dashboard]
+    PerformanceDashboard -->|Query stats| DataManager
 
     %% Subgraphs for organization
-    subgraph "User Interface"
+    subgraph "User Interface Layer"
         User
-        Login
-        Chat
-        Admin
-        RebuildDB
+        WebApp
+        CLI
+        ChatInterface
+        AdminInterface
+        InteractiveCLI
+    end
+
+    subgraph "Authentication & Authorization"
+        AzureAuth
+        KeyVault
     end
 
-    subgraph "RAG Processing"
-        Retriever
-        Chain
+    subgraph "RAG Processing Engine"
+        SemanticPipeline
+        QueryProcessor
+        SemanticRetrieval
+        GenerationTool
         LLM
     end
 
-    subgraph "Data Storage"
-        VectorDB
-        AuthDB
+    subgraph "Data Storage Layer"
+        WeaviateDB
+        DataManager
+        StorageSystem
+        LocalStorage
+        AzureStorage
     end
 
-    subgraph "Data ingestion"
+    subgraph "Application Components"
+        FeaturesManager
+        HistoryManager
+        PerformanceDashboard
+        %% ConfigManager is grouped under "Configuration & Environment" (a node can belong to only one subgraph)
+    end
+
+    subgraph "Data Ingestion Pipeline"
+        IngestionPipeline
+        ConfluenceConnector
         ConfluenceAPI
-        DataLoader
-        TextSplitter
-        Embedder
+        DocumentProcessor
+        DocumentChunker
+        EmbeddingService
+    end
+
+    subgraph "Configuration & Environment"
+        ConfigManager
+        %% KeyVault is grouped under "Authentication & Authorization" (a node can belong to only one subgraph)
+        EnvFile
     end
 
-    classDef interface fill:#f9f,stroke:#333,stroke-width:1px
-    classDef processing fill:#bbf,stroke:#333,stroke-width:1px
-    classDef storage fill:#bfb,stroke:#333,stroke-width:1px
-    classDef ingestion fill:#fbb,stroke:#333,stroke-width:1px
+    %% Styling
+    classDef interface fill:#e1f5fe,stroke:#0277bd,stroke-width:2px
+    classDef auth fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
+    classDef processing fill:#e8f5e8,stroke:#2e7d32,stroke-width:2px
+    classDef storage fill:#fff3e0,stroke:#ef6c00,stroke-width:2px
+    classDef components fill:#fce4ec,stroke:#c2185b,stroke-width:2px
+    classDef ingestion fill:#f1f8e9,stroke:#558b2f,stroke-width:2px
+    classDef config fill:#f5f5f5,stroke:#616161,stroke-width:2px
 
-    class User,Login,Chat,Admin,RebuildDB interface
-    class Retriever,Chain,LLM processing
-    class VectorDB,AuthDB storage
-    class ConfluenceAPI,DataLoader,TextSplitter,Embedder ingestion
+    class User,WebApp,CLI,ChatInterface,AdminInterface,InteractiveCLI interface
+    class AzureAuth,KeyVault auth
+    class SemanticPipeline,QueryProcessor,SemanticRetrieval,GenerationTool,LLM processing
+    class WeaviateDB,DataManager,StorageSystem,LocalStorage,AzureStorage storage
+    class FeaturesManager,HistoryManager,PerformanceDashboard components
+    class IngestionPipeline,ConfluenceConnector,ConfluenceAPI,DocumentProcessor,DocumentChunker,EmbeddingService ingestion
+    class ConfigManager,EnvFile config
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,6 @@ dependencies = [
     "streamlit>=1.31.0",
     "langchain-core>=0.2.0",
     "langchain-text-splitters>=0.2.0",
-    "faiss-cpu>=1.7.4",
     "pandas>=2.0.0",
     "python-dotenv>=1.0.0",
     "huggingface-hub[hf-xet]>=0.19.0",

diff --git a/src/rag/semantic_pipeline.py b/src/rag/semantic_pipeline.py
@@ -239,51 +239,6 @@ def compare_with_basic_retrieval(self, query: str, k: int = 5) -> Dict[str, Any]
         except Exception as e:
             return {"error": str(e), "query": query}
 
-    def test_problematic_query(self, query: str = "qui sont les collaborateurs sur Isschat") -> Dict[str, Any]:
-        """
-        Test the pipeline with the specific problematic query about collaborators.
-
-        Args:
-            query: The problematic query to test
-
-        Returns:
-            Detailed test results
-        """
-        try:
-            # Test with full semantic pipeline
-            start_time = time.time()
-            answer, sources = self.process_query(query, verbose=True)
-            response_time = (time.time() - start_time) * 1000
-
-            # Get comparison data
-            comparison = self.compare_with_basic_retrieval(query)
-
-            # Analyze if the answer contains team information
-            team_keywords = ["vincent", "nicolas", "emin", "fraillon", "lambropoulos", "calyaka", "équipe", "team"]
-            answer_lower = answer.lower()
-            team_mentions = [keyword for keyword in team_keywords if keyword in answer_lower]
-
-            return {
-                "test_query": query,
-                "semantic_pipeline_result": {
-                    "answer": answer,
-                    "sources": sources,
-                    "response_time_ms": response_time,
-                    "team_keywords_found": team_mentions,
-                    "contains_team_info": len(team_mentions) > 2,
-                },
-                "comparison": comparison,
-                "success_criteria": {
-                    "finds_team_info": len(team_mentions) > 2,
-                    "mentions_specific_names": any(name in answer_lower for name in ["vincent", "nicolas", "emin"]),
-                    "better_than_basic": comparison.get("improvement_metrics", {}).get("semantic_advantage", False),
-                },
-                "pipeline_status": self.get_status(),
-            }
-
-        except Exception as e:
-            return {"error": str(e), "test_query": query}
-
     def _format_sources_for_storage(self, formatted_docs) -> list[dict]:
         """Format sources for storage with enhanced metadata"""
         sources = []
@@ -351,24 +306,6 @@ def get_status(self) -> Dict[str, Any]:
         except Exception as e:
             return {"pipeline_type": "semantic_rag_pipeline", "ready": False, "error": str(e)}
 
-    def check_pipeline(self, test_query: str = "qui sont les collaborateurs sur Isschat") -> Dict[str, Any]:
-        """Check pipeline with default problematic query"""
-        try:
-            if not self.is_ready():
-                return {"success": False, "error": "Pipeline not ready", "details": self.get_status()}
-
-            # Run the problematic query test
-            test_result = self.test_problematic_query(test_query)
-
-            return {
-                "success": test_result.get("success_criteria", {}).get("finds_team_info", False),
-                "test_result": test_result,
-                "pipeline_status": self.get_status(),
-            }
-
-        except Exception as e:
-            return {"success": False, "error": str(e), "test_query": test_query}
-
 
 class SemanticRAGPipelineFactory:
     """Factory for creating semantic RAG pipelines"""

diff --git a/src/vectordb/interface.py b/src/vectordb/interface.py
@@ -1,6 +1,6 @@
 """
 Vector database interface for clean abstractions.
-Supports both Qdrant and FAISS implementations.
+Supports the Weaviate vector database implementation.
 """
 
 from abc import ABC, abstractmethod

diff --git a/src/webapp/app.py b/src/webapp/app.py
@@ -113,7 +113,7 @@ def initialize_embedder():
 def get_model(rebuild_db=False):
     # Display a spinner during loading
     with st.spinner("Loading RAG model..."):
-        # Check if the index.faiss file exists
+        # Initialize RAG model
         from src.config.settings import get_debug_info
 
         # Get debug info

diff --git a/uv.lock b/uv.lock
-Original file line number
+Diff line change
@@ Expand Up / @@ -67,7 +67,6 @@ rag_evaluation/config/test_datasets/ @@
     # Project specific files
     *.pkl
-    *.faiss
     *.pdf
     *.docx
     *.xlsx
@@ Expand Down @@