diff --git a/.gitignore b/.gitignore
index c9aca18..997034a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,7 +67,6 @@ rag_evaluation/config/test_datasets/
# Project specific files
*.pkl
-*.faiss
*.pdf
*.docx
*.xlsx
diff --git a/flux_donnees.mmd b/flux_donnees.mmd
index a05d660..bed8096 100644
--- a/flux_donnees.mmd
+++ b/flux_donnees.mmd
@@ -1,70 +1,128 @@
graph TD
%% Users and Interface
- User([User]) -->|Connects| Login[Login Page]
- Login -->|Verifies credentials| AuthDB[(SQLite Database<br/>Users)]
- Login -->|Authentication successful| Chat[Chat Interface]
- Login -->|Admin access| Admin[User Management]
-
- %% Question Flow
- User -->|Asks a question| Chat
- Chat -->|Sends query| Retriever[FAISS Retriever]
-
- %% Document Search
- Retriever -->|Vector similarity search| VectorDB[(FAISS Vector<br/>Database)]
- VectorDB -->|Relevant documents| Retriever
-
- %% Response Generation
- Retriever -->|Document context| Chain[LangChain Chain]
- Chain -->|Builds prompt| LLM[LLM Model<br/>GPT 4o]
- LLM -->|Generates response in French| Chain
- Chain -->|Formatted response| Chat
- Chat -->|Displays response + sources| User
-
- %% Database Initialization and Update
- ConfluenceAPI[Confluence API] -->|Retrieves pages| DataLoader[DataLoader]
- DataLoader -->|Processes documents| TextSplitter[Text Splitter]
- TextSplitter -->|Splits into chunks| Embedder[Embeddings<br/>HuggingFace]
- Embedder -->|Vectors + Metadata| VectorDB
-
- %% User Management
- Admin -->|Add/Remove| AuthDB
-
- %% Advanced Options
- RebuildDB[Option: Rebuild DB] -.->|Triggers| DataLoader
+ User([User]) -->|Accesses| WebApp[Streamlit Web App]
+ User -->|CLI Commands| CLI[CLI Interface]
+
+ %% Authentication Flow
+ WebApp -->|Authentication| AzureAuth[Azure AD Authentication]
+ AzureAuth -->|Valid credentials| ChatInterface[Chat Interface]
+ AzureAuth -->|Admin access| AdminInterface[Admin Interface]
+
+ %% Main Chat Flow
+ User -->|Asks question| ChatInterface
+ ChatInterface -->|Query| SemanticPipeline[Semantic RAG Pipeline]
+
+ %% Semantic Processing Pipeline
+ SemanticPipeline -->|Analyze query| QueryProcessor[Query Processor]
+ QueryProcessor -->|Intent & expansion| SemanticRetrieval[Semantic Retrieval Tool]
+ SemanticRetrieval -->|Vector search| WeaviateDB[(Weaviate Vector DB<br/>Collection: isschat_docs)]
+ WeaviateDB -->|Relevant documents| SemanticRetrieval
+ SemanticRetrieval -->|Ranked results| GenerationTool[Generation Tool]
+ GenerationTool -->|Context + prompt| LLM[Gemini 2.5 Flash Lite]
+ LLM -->|Generated response| GenerationTool
+ GenerationTool -->|Final answer + sources| ChatInterface
+
+ %% Data Storage & Management
+ GenerationTool -->|Save conversation| DataManager[Data Manager]
+ DataManager -->|Store data| StorageSystem{Storage System}
+ StorageSystem -->|Local| LocalStorage[(Local File Storage)]
+ StorageSystem -->|Cloud| AzureStorage[(Azure Blob Storage)]
+
+ %% Features & History
+ ChatInterface -->|User feedback| FeaturesManager[Features Manager]
+ ChatInterface -->|Conversation history| HistoryManager[History Manager]
+ FeaturesManager -->|Feedback data| DataManager
+ HistoryManager -->|Load/save history| DataManager
+
+ %% CLI Operations
+ CLI -->|Ingest command| IngestionPipeline[Confluence Ingestion Pipeline]
+ CLI -->|Status command| StatusCheck[System Status Check]
+ CLI -->|Query command| SemanticPipeline
+ CLI -->|Chat command| InteractiveCLI[Interactive CLI Chat]
+
+ %% Data Ingestion Flow
+ IngestionPipeline -->|Extract| ConfluenceConnector[Confluence Connector]
+ ConfluenceConnector -->|Fetch pages| ConfluenceAPI[Confluence API]
+ ConfluenceConnector -->|Raw documents| DocumentProcessor[Document Processor]
+ DocumentProcessor -->|Clean & structure| DocumentChunker[Document Chunker]
+ DocumentChunker -->|Text chunks| EmbeddingService[Embedding Service<br/>multilingual-e5-small]
+ EmbeddingService -->|Vector embeddings| WeaviateDB
+
+ %% Configuration & Secrets
+ SemanticPipeline -.->|Config| ConfigManager[Configuration Manager]
+ ConfigManager -.->|Secrets| KeyVault[Azure Key Vault]
+ ConfigManager -.->|Settings| EnvFile[Environment Variables]
+
+ %% Performance & Monitoring
+ AdminInterface -->|View metrics| PerformanceDashboard[Performance Dashboard]
+ PerformanceDashboard -->|Query stats| DataManager
%% Subgraphs for organization
- subgraph "User Interface"
+ subgraph "User Interface Layer"
User
- Login
- Chat
- Admin
- RebuildDB
+ WebApp
+ CLI
+ ChatInterface
+ AdminInterface
+ InteractiveCLI
+ end
+
+ subgraph "Authentication & Authorization"
+ AzureAuth
+ KeyVault
end
- subgraph "RAG Processing"
- Retriever
- Chain
+ subgraph "RAG Processing Engine"
+ SemanticPipeline
+ QueryProcessor
+ SemanticRetrieval
+ GenerationTool
LLM
end
- subgraph "Data Storage"
- VectorDB
- AuthDB
+ subgraph "Data Storage Layer"
+ WeaviateDB
+ DataManager
+ StorageSystem
+ LocalStorage
+ AzureStorage
end
- subgraph "Data ingestion"
+ subgraph "Application Components"
+ FeaturesManager
+ HistoryManager
+ PerformanceDashboard
+ %% ConfigManager is grouped under Configuration and Environment, matching its config class
+ end
+
+ subgraph "Data Ingestion Pipeline"
+ IngestionPipeline
+ ConfluenceConnector
ConfluenceAPI
- DataLoader
- TextSplitter
- Embedder
+ DocumentProcessor
+ DocumentChunker
+ EmbeddingService
+ end
+
+ subgraph "Configuration & Environment"
+ ConfigManager
+ %% KeyVault is grouped under Authentication and Authorization, matching its auth class
+ EnvFile
end
- classDef interface fill:#f9f,stroke:#333,stroke-width:1px
- classDef processing fill:#bbf,stroke:#333,stroke-width:1px
- classDef storage fill:#bfb,stroke:#333,stroke-width:1px
- classDef ingestion fill:#fbb,stroke:#333,stroke-width:1px
+ %% Styling
+ classDef interface fill:#e1f5fe,stroke:#0277bd,stroke-width:2px
+ classDef auth fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
+ classDef processing fill:#e8f5e8,stroke:#2e7d32,stroke-width:2px
+ classDef storage fill:#fff3e0,stroke:#ef6c00,stroke-width:2px
+ classDef components fill:#fce4ec,stroke:#c2185b,stroke-width:2px
+ classDef ingestion fill:#f1f8e9,stroke:#558b2f,stroke-width:2px
+ classDef config fill:#f5f5f5,stroke:#616161,stroke-width:2px
- class User,Login,Chat,Admin,RebuildDB interface
- class Retriever,Chain,LLM processing
- class VectorDB,AuthDB storage
- class ConfluenceAPI,DataLoader,TextSplitter,Embedder ingestion
\ No newline at end of file
+ class User,WebApp,CLI,ChatInterface,AdminInterface,InteractiveCLI interface
+ class AzureAuth,KeyVault auth
+ class SemanticPipeline,QueryProcessor,SemanticRetrieval,GenerationTool,LLM processing
+ class WeaviateDB,DataManager,StorageSystem,LocalStorage,AzureStorage storage
+ class FeaturesManager,HistoryManager,PerformanceDashboard components
+ class IngestionPipeline,ConfluenceConnector,ConfluenceAPI,DocumentProcessor,DocumentChunker,EmbeddingService ingestion
+ class ConfigManager,EnvFile config
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index c1d7778..a04a21d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,6 @@ dependencies = [
"streamlit>=1.31.0",
"langchain-core>=0.2.0",
"langchain-text-splitters>=0.2.0",
- "faiss-cpu>=1.7.4",
"pandas>=2.0.0",
"python-dotenv>=1.0.0",
"huggingface-hub[hf-xet]>=0.19.0",
diff --git a/src/rag/semantic_pipeline.py b/src/rag/semantic_pipeline.py
index 3efd8ed..20f47aa 100644
--- a/src/rag/semantic_pipeline.py
+++ b/src/rag/semantic_pipeline.py
@@ -239,51 +239,6 @@ def compare_with_basic_retrieval(self, query: str, k: int = 5) -> Dict[str, Any]
except Exception as e:
return {"error": str(e), "query": query}
- def test_problematic_query(self, query: str = "qui sont les collaborateurs sur Isschat") -> Dict[str, Any]:
- """
- Test the pipeline with the specific problematic query about collaborators.
-
- Args:
- query: The problematic query to test
-
- Returns:
- Detailed test results
- """
- try:
- # Test with full semantic pipeline
- start_time = time.time()
- answer, sources = self.process_query(query, verbose=True)
- response_time = (time.time() - start_time) * 1000
-
- # Get comparison data
- comparison = self.compare_with_basic_retrieval(query)
-
- # Analyze if the answer contains team information
- team_keywords = ["vincent", "nicolas", "emin", "fraillon", "lambropoulos", "calyaka", "équipe", "team"]
- answer_lower = answer.lower()
- team_mentions = [keyword for keyword in team_keywords if keyword in answer_lower]
-
- return {
- "test_query": query,
- "semantic_pipeline_result": {
- "answer": answer,
- "sources": sources,
- "response_time_ms": response_time,
- "team_keywords_found": team_mentions,
- "contains_team_info": len(team_mentions) > 2,
- },
- "comparison": comparison,
- "success_criteria": {
- "finds_team_info": len(team_mentions) > 2,
- "mentions_specific_names": any(name in answer_lower for name in ["vincent", "nicolas", "emin"]),
- "better_than_basic": comparison.get("improvement_metrics", {}).get("semantic_advantage", False),
- },
- "pipeline_status": self.get_status(),
- }
-
- except Exception as e:
- return {"error": str(e), "test_query": query}
-
def _format_sources_for_storage(self, formatted_docs) -> list[dict]:
"""Format sources for storage with enhanced metadata"""
sources = []
@@ -351,24 +306,6 @@ def get_status(self) -> Dict[str, Any]:
except Exception as e:
return {"pipeline_type": "semantic_rag_pipeline", "ready": False, "error": str(e)}
- def check_pipeline(self, test_query: str = "qui sont les collaborateurs sur Isschat") -> Dict[str, Any]:
- """Check pipeline with default problematic query"""
- try:
- if not self.is_ready():
- return {"success": False, "error": "Pipeline not ready", "details": self.get_status()}
-
- # Run the problematic query test
- test_result = self.test_problematic_query(test_query)
-
- return {
- "success": test_result.get("success_criteria", {}).get("finds_team_info", False),
- "test_result": test_result,
- "pipeline_status": self.get_status(),
- }
-
- except Exception as e:
- return {"success": False, "error": str(e), "test_query": test_query}
-
class SemanticRAGPipelineFactory:
"""Factory for creating semantic RAG pipelines"""
diff --git a/src/vectordb/interface.py b/src/vectordb/interface.py
index f39d4cf..e481e3a 100644
--- a/src/vectordb/interface.py
+++ b/src/vectordb/interface.py
@@ -1,6 +1,6 @@
"""
Vector database interface for clean abstractions.
-Supports both Qdrant and FAISS implementations.
+Supports Weaviate vector database implementation.
"""
from abc import ABC, abstractmethod
diff --git a/src/webapp/app.py b/src/webapp/app.py
index def2a4f..d25520c 100644
--- a/src/webapp/app.py
+++ b/src/webapp/app.py
@@ -113,7 +113,7 @@ def initialize_embedder():
def get_model(rebuild_db=False):
# Display a spinner during loading
with st.spinner("Loading RAG model..."):
- # Check if the index.faiss file exists
+ # Initialize RAG model
from src.config.settings import get_debug_info
# Get debug info
diff --git a/uv.lock b/uv.lock
index ea0ced3..53ac286 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1073,28 +1073,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c3/5f/9dd2090e20cd39244a21ebb8b95589d6fa6ceecc587ed943290847d47df1/evaluate-0.4.4-py3-none-any.whl", hash = "sha256:e7e10d2617847a6127f023dd444ba241c7a1c8e3e081f72b15f35686e8220dbd", size = 84097, upload-time = "2025-06-20T17:48:18.004Z" },
]
-[[package]]
-name = "faiss-cpu"
-version = "1.11.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "numpy" },
- { name = "packaging" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/e7/9a/e33fc563f007924dd4ec3c5101fe5320298d6c13c158a24a9ed849058569/faiss_cpu-1.11.0.tar.gz", hash = "sha256:44877b896a2b30a61e35ea4970d008e8822545cb340eca4eff223ac7f40a1db9", size = 70218, upload-time = "2025-04-28T07:48:30.459Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/3b/d3/7178fa07047fd770964a83543329bb5e3fc1447004cfd85186ccf65ec3ee/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:356437b9a46f98c25831cdae70ca484bd6c05065af6256d87f6505005e9135b9", size = 3313807, upload-time = "2025-04-28T07:47:54.533Z" },
- { url = "https://files.pythonhosted.org/packages/9e/71/25f5f7b70a9f22a3efe19e7288278da460b043a3b60ad98e4e47401ed5aa/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c4a3d35993e614847f3221c6931529c0bac637a00eff0d55293e1db5cb98c85f", size = 7913537, upload-time = "2025-04-28T07:47:56.723Z" },
- { url = "https://files.pythonhosted.org/packages/b0/c8/a5cb8466c981ad47750e1d5fda3d4223c82f9da947538749a582b3a2d35c/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8f9af33e0b8324e8199b93eb70ac4a951df02802a9dcff88e9afc183b11666f0", size = 3785180, upload-time = "2025-04-28T07:47:59.004Z" },
- { url = "https://files.pythonhosted.org/packages/7f/37/eaf15a7d80e1aad74f56cf737b31b4547a1a664ad3c6e4cfaf90e82454a8/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:48b7e7876829e6bdf7333041800fa3c1753bb0c47e07662e3ef55aca86981430", size = 31287630, upload-time = "2025-04-28T07:48:01.248Z" },
- { url = "https://files.pythonhosted.org/packages/ff/5c/902a78347e9c47baaf133e47863134e564c39f9afe105795b16ee986b0df/faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:bdc199311266d2be9d299da52361cad981393327b2b8aa55af31a1b75eaaf522", size = 15005398, upload-time = "2025-04-28T07:48:04.232Z" },
- { url = "https://files.pythonhosted.org/packages/92/90/d2329ce56423cc61f4c20ae6b4db001c6f88f28bf5a7ef7f8bbc246fd485/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:0c98e5feff83b87348e44eac4d578d6f201780dae6f27f08a11d55536a20b3a8", size = 3313807, upload-time = "2025-04-28T07:48:06.486Z" },
- { url = "https://files.pythonhosted.org/packages/24/14/8af8f996d54e6097a86e6048b1a2c958c52dc985eb4f935027615079939e/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:796e90389427b1c1fb06abdb0427bb343b6350f80112a2e6090ac8f176ff7416", size = 7913539, upload-time = "2025-04-28T07:48:08.338Z" },
- { url = "https://files.pythonhosted.org/packages/b2/2b/437c2f36c3aa3cffe041479fced1c76420d3e92e1f434f1da3be3e6f32b1/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2b6e355dda72b3050991bc32031b558b8f83a2b3537a2b9e905a84f28585b47e", size = 3785181, upload-time = "2025-04-28T07:48:10.594Z" },
- { url = "https://files.pythonhosted.org/packages/66/75/955527414371843f558234df66fa0b62c6e86e71e4022b1be9333ac6004c/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6c482d07194638c169b4422774366e7472877d09181ea86835e782e6304d4185", size = 31287635, upload-time = "2025-04-28T07:48:12.93Z" },
- { url = "https://files.pythonhosted.org/packages/50/51/35b7a3f47f7859363a367c344ae5d415ea9eda65db0a7d497c7ea2c0b576/faiss_cpu-1.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:13eac45299532b10e911bff1abbb19d1bf5211aa9e72afeade653c3f1e50e042", size = 15005455, upload-time = "2025-04-28T07:48:16.173Z" },
-]
-
[[package]]
name = "filelock"
version = "3.18.0"
@@ -1577,7 +1555,6 @@ dependencies = [
{ name = "beautifulsoup4" },
{ name = "dotenv" },
{ name = "einops" },
- { name = "faiss-cpu" },
{ name = "huggingface-hub", extra = ["hf-xet"] },
{ name = "langchain" },
{ name = "langchain-community" },
@@ -1648,7 +1625,6 @@ requires-dist = [
{ name = "dvc", extras = ["azure"], marker = "extra == 'evaluation'", specifier = ">=3.0.0" },
{ name = "einops", specifier = ">=0.8.1" },
{ name = "evaluate", marker = "extra == 'evaluation'", specifier = ">=0.4.0" },
- { name = "faiss-cpu", specifier = ">=1.7.4" },
{ name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.19.0" },
{ name = "langchain", specifier = ">=0.2.0" },
{ name = "langchain-community", specifier = ">=0.2.0" },