From b4a507d79819725921c2fccbc90ffec9319480bb Mon Sep 17 00:00:00 2001
From: Xu Wenhao <xuwenhao@shoppal.ai>
Date: Tue, 2 Jan 2024 10:27:37 +0000
Subject: [PATCH 01/12] add dev container settings

---
 .devcontainer/Dockerfile           | 29 +++++++++++++++++
 .devcontainer/devcontainer.json    | 52 ++++++++++++++++++++++++++++++
 .devcontainer/docker-compose.yml   | 13 ++++++++
 .devcontainer/postCreateCommand.sh |  6 ++++
 .gitignore                         |  1 +
 requirements-dev.txt               |  4 +++
 server.py                          |  2 +-
 7 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 .devcontainer/Dockerfile
 create mode 100644 .devcontainer/devcontainer.json
 create mode 100644 .devcontainer/docker-compose.yml
 create mode 100755 .devcontainer/postCreateCommand.sh
 create mode 100644 .gitignore
 create mode 100644 requirements-dev.txt

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000..0ad824b
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,29 @@
+FROM python:3.8
+
+RUN pip install --no-cache-dir --upgrade pip
+RUN apt update && apt install -y zsh curl git sudo wget libsndfile1
+
+ARG USERNAME=vscode
+ARG USER_UID=1000
+ARG USER_GID=$USER_UID
+
+RUN groupadd --gid $USER_GID $USERNAME \
+    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME
+
+RUN usermod -aG sudo $USERNAME
+RUN echo 'vscode ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+
+USER $USERNAME
+
+RUN cd ~ && wget https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh && sh install.sh
+
+# Create and activate a Python virtual environment
+RUN python -m venv ~/venv
+RUN echo "source ~/venv/bin/activate" >> ~/.zshrc
+
+# Set Python path in the virtual environment.
+RUN echo "export PYTHONPATH=\$PYTHONPATH:/workspace" >> ~/.zshrc
+RUN /bin/zsh ~/.zshrc
+
+ENV DEBIAN_FRONTEND=dialog
+
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..0ae1534
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,52 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/postgres
+{
+    "name": "VoiceStreamAI Demo",
+    "dockerComposeFile": "docker-compose.yml",
+    "service": "app",
+    "workspaceFolder": "/workspace",
+    "customizations": {
+        "vscode": {
+            "settings": {
+                "terminal.integrated.defaultProfile.linux": "zsh", 
+                "terminal.integrated.profiles.linux": {
+                    "zsh": {
+                        "path": "/bin/zsh"
+                    }
+                }
+            },            
+            "extensions": [
+                "GitHub.copilot",
+                "GitHub.copilot-labs",
+                "GitHub.vscode-pull-request-github",
+                "ms-python.python",
+                "ms-python.vscode-pylance",
+                "ms-python.pylint",
+                "ms-python.isort",
+                "ms-python.black-formatter",
+                "matangover.mypy",
+                "ms-toolsai.jupyter",
+                "ms-toolsai.jupyter-keymap",
+                "ms-toolsai.vscode-jupyter-slideshow",
+                "eamodio.gitlens",
+                "github.vscode-github-actions"
+            ]
+        }
+    },
+    // Features to add to the dev container. More info: https://containers.dev/features.
+    // "features": {},
+
+    // Use 'forwardPorts' to make a list of ports inside the container available locally.
+    // This can be used to network with other containers or the host.
+    "forwardPorts": [8765],
+
+    // Use 'postCreateCommand' to run commands after the container is created.
+    "postCreateCommand": "./.devcontainer/postCreateCommand.sh"
+
+    // Configure tool-specific properties.
+    // "customizations": {},
+
+    // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+    // "remoteUser": "root"
+}
+
diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
new file mode 100644
index 0000000..2c7b43a
--- /dev/null
+++ b/.devcontainer/docker-compose.yml
@@ -0,0 +1,13 @@
+version: '3.8'
+
+services:
+  app:
+    build:
+      context: ..
+      dockerfile: .devcontainer/Dockerfile
+    volumes:
+      - ..:/workspace
+    ports:
+      - "8765:8765"
+    user: vscode
+    command: sleep infinity
\ No newline at end of file
diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh
new file mode 100755
index 0000000..a3dcd06
--- /dev/null
+++ b/.devcontainer/postCreateCommand.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+source /home/vscode/venv/bin/activate
+pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+pip install -r requirements.txt
+pip install -r requirements-dev.txt
+sudo chown vscode:vscode /workspace
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4c49bd7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..0f9b57a
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,4 @@
+transformers
+pyannote.core
+pyannote.audio
+websockets
diff --git a/server.py b/server.py
index 6f3a7b0..3f5c2b5 100644
--- a/server.py
+++ b/server.py
@@ -25,7 +25,7 @@
 AUDIO_CHANNELS = 1
 SAMPLES_WIDTH = 2 # int16
 DEBUG = True
-VAD_AUTH_TOKEN = "FILL ME" # get your key here -> https://huggingface.co/pyannote/segmentation
+VAD_AUTH_TOKEN = os.environ.get("HF_TOKEN") # get your key here -> https://huggingface.co/pyannote/segmentation
 
 DEFAULT_CLIENT_CONFIG = {
     "language" : None, # multilingual

From 9564d1996b5c2ce1663226166d2784823d4e2bfb Mon Sep 17 00:00:00 2001
From: Xu Wenhao <xuwenhao@shoppal.ai>
Date: Tue, 2 Jan 2024 14:34:35 +0000
Subject: [PATCH 02/12] basic dev container setup

---
 .devcontainer/Dockerfile           | 14 ++++++++++----
 .devcontainer/devcontainer.json    |  6 +++++-
 .devcontainer/docker-compose.yml   | 12 ++++++++++--
 .devcontainer/postCreateCommand.sh |  2 +-
 .env.example                       |  4 ++++
 .gitignore                         |  1 +
 requirements-dev.txt               |  1 +
 7 files changed, 32 insertions(+), 8 deletions(-)
 create mode 100644 .env.example

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 0ad824b..d344f52 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,8 +1,15 @@
-FROM python:3.8
+FROM ghcr.io/shoppal-ai/llm-base
+
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
 
-RUN pip install --no-cache-dir --upgrade pip
 RUN apt update && apt install -y zsh curl git sudo wget libsndfile1
 
+# # install python environment
+# RUN apt install -y python3.8-dev python3-pip 
+# RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 
+RUN apt install -y python3.10-venv
+
 ARG USERNAME=vscode
 ARG USER_UID=1000
 ARG USER_GID=$USER_UID
@@ -17,8 +24,7 @@ USER $USERNAME
 
 RUN cd ~ && wget https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh && sh install.sh
 
-# Create and activate a Python virtual environment
-RUN python -m venv ~/venv
+RUN python3 -m venv ~/venv
 RUN echo "source ~/venv/bin/activate" >> ~/.zshrc
 
 # Set Python path in the virtual environment.
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 0ae1534..ebb4024 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -4,6 +4,7 @@
     "name": "VoiceStreamAI Demo",
     "dockerComposeFile": "docker-compose.yml",
     "service": "app",
+    "workspaceMount": "source=~/VoiceStreamAI,target=/workspace,type=bind,consistency=cached",
     "workspaceFolder": "/workspace",
     "customizations": {
         "vscode": {
@@ -40,13 +41,16 @@
     // This can be used to network with other containers or the host.
     "forwardPorts": [8765],
 
+    // Run Args to use GPU
+    // "runArgs": ["--gpus", "all"],
     // Use 'postCreateCommand' to run commands after the container is created.
     "postCreateCommand": "./.devcontainer/postCreateCommand.sh"
 
     // Configure tool-specific properties.
     // "customizations": {},
 
+
     // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
-    // "remoteUser": "root"
+    // "remoteUser": "root",
 }
 
diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
index 2c7b43a..856e39c 100644
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@@ -4,10 +4,18 @@ services:
   app:
     build:
       context: ..
+      args:
+        HTTP_PROXY: ${HTTP_PROXY}
+        HTTPS_PROXY: ${HTTPS_PROXY}
       dockerfile: .devcontainer/Dockerfile
     volumes:
-      - ..:/workspace
+      - ~/VoiceStreamAI:/workspace:cached
     ports:
       - "8765:8765"
     user: vscode
-    command: sleep infinity
\ No newline at end of file
+    command: sleep infinity
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - capabilities: [gpu]
\ No newline at end of file
diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh
index a3dcd06..7109689 100755
--- a/.devcontainer/postCreateCommand.sh
+++ b/.devcontainer/postCreateCommand.sh
@@ -3,4 +3,4 @@ source /home/vscode/venv/bin/activate
 pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
 pip install -r requirements.txt
 pip install -r requirements-dev.txt
-sudo chown vscode:vscode /workspace
+sudo chown vscode:vscode /workspace
\ No newline at end of file
diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..136fcc3
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,4 @@
+HTTP_PROXY=
+HTTPS_PROXY=
+HF_ENDPOINT=https://hf-mirror.com
+HF_TOKEN=
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 4c49bd7..159be9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 .env
+SHOPPAL_README.md
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 0f9b57a..00b23dd 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,4 +1,5 @@
 transformers
 pyannote.core
 pyannote.audio
+torchvision
 websockets

From 9244bfc3f0525432fef167a6ad9892eaaf324cb5 Mon Sep 17 00:00:00 2001
From: Xu Wenhao <xuwenhao@shoppal.ai>
Date: Tue, 2 Jan 2024 15:19:05 +0000
Subject: [PATCH 03/12] now it works in dev container

---
 .devcontainer/Dockerfile | 2 +-
 .gitignore               | 3 ++-
 requirements-dev.txt     | 3 ++-
 server.py                | 6 +++++-
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index d344f52..c12810b 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -3,7 +3,7 @@ FROM ghcr.io/shoppal-ai/llm-base
 ARG HTTP_PROXY
 ARG HTTPS_PROXY
 
-RUN apt update && apt install -y zsh curl git sudo wget libsndfile1
+RUN apt update && apt install -y zsh curl git sudo wget libsndfile1 ffmpeg
 
 # # install python environment
 # RUN apt install -y python3.8-dev python3-pip 
diff --git a/.gitignore b/.gitignore
index 159be9e..47fadcd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .env
-SHOPPAL_README.md
\ No newline at end of file
+SHOPPAL_README.md
+audio_files/*
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 00b23dd..4788a7f 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,5 +1,6 @@
-transformers
+ffmpeg
 pyannote.core
 pyannote.audio
 torchvision
+transformers
 websockets
diff --git a/server.py b/server.py
index 3f5c2b5..f5136e7 100644
--- a/server.py
+++ b/server.py
@@ -19,7 +19,7 @@
 from pyannote.audio import Model
 from pyannote.audio.pipelines import VoiceActivityDetection
 
-HOST = 'localhost'
+HOST = '0.0.0.0'
 PORT = 8765
 SAMPLING_RATE = 16000
 AUDIO_CHANNELS = 1
@@ -123,11 +123,15 @@ async def transcribe_and_send(client_id, websocket, new_audio_data):
         start_time_transcription = time.time()
         
         if client_configs[client_id]['language'] is not None:
+            print("Entering recognition pipeline no language")
             result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']})
         else:
+            print("Entering recognition pipeline has language")
             result = recognition_pipeline(file_name)
 
         transcription_time = time.time() - start_time_transcription
+        print("result is ", result)
+
         if DEBUG: print(f"Transcription Time: {transcription_time:.2f} seconds")
 
         print(f"Client ID {client_id}: Transcribed : {result['text']}")

From 5692f514c94a39b23a047b00bc05095c8a097cf1 Mon Sep 17 00:00:00 2001
From: Xu Wenhao <xuwenhao@shoppal.ai>
Date: Tue, 2 Jan 2024 15:21:36 +0000
Subject: [PATCH 04/12] remove useless logs

---
 server.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/server.py b/server.py
index f5136e7..9682f30 100644
--- a/server.py
+++ b/server.py
@@ -123,14 +123,11 @@ async def transcribe_and_send(client_id, websocket, new_audio_data):
         start_time_transcription = time.time()
         
         if client_configs[client_id]['language'] is not None:
-            print("Entering recognition pipeline no language")
             result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']})
         else:
-            print("Entering recognition pipeline has language")
             result = recognition_pipeline(file_name)
 
         transcription_time = time.time() - start_time_transcription
-        print("result is ", result)
 
         if DEBUG: print(f"Transcription Time: {transcription_time:.2f} seconds")
 

From e68749c78cc091906eb52ac5da30bef464184d64 Mon Sep 17 00:00:00 2001
From: Xu Wenhao <xuwenhao@shoppal.ai>
Date: Wed, 3 Jan 2024 13:49:15 +0000
Subject: [PATCH 05/12] fix environment issue

---
 .devcontainer/Dockerfile         | 4 ++++
 .devcontainer/docker-compose.yml | 5 +++--
 .gitignore                       | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index c12810b..e485bfe 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -31,5 +31,9 @@ RUN echo "source ~/venv/bin/activate" >> ~/.zshrc
 RUN echo "export PYTHONPATH=\$PYTHONPATH:/workspace" >> ~/.zshrc
 RUN /bin/zsh ~/.zshrc
 
+# Setup HF_ENDPOINT and PYANNOTE_CACHE
+ENV HF_ENDPOINT=https://hf-mirror.com
+ENV PYANNOTE_CACHE=/data0/cache/pyannote/
+
 ENV DEBIAN_FRONTEND=dialog
 
diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
index 856e39c..0e20b7a 100644
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@@ -5,11 +5,12 @@ services:
     build:
       context: ..
       args:
-        HTTP_PROXY: ${HTTP_PROXY}
-        HTTPS_PROXY: ${HTTPS_PROXY}
+        HTTP_PROXY: http://10.232.14.15:8118
+        HTTPS_PROXY: http://10.232.14.15:8118
       dockerfile: .devcontainer/Dockerfile
     volumes:
       - ~/VoiceStreamAI:/workspace:cached
+      - /data0:/data0:cached
     ports:
       - "8765:8765"
     user: vscode
diff --git a/.gitignore b/.gitignore
index 47fadcd..7700350 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .env
+.DS_STORE
 SHOPPAL_README.md
 audio_files/*
\ No newline at end of file

From 23ac5dfdf34a728da95b4322489725b50f27a723 Mon Sep 17 00:00:00 2001
From: wangji <wangji@shoppal.ai>
Date: Thu, 4 Jan 2024 13:19:39 +0000
Subject: [PATCH 06/12] select cuda device & en only model

---
 server.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/server.py b/server.py
index 9682f30..5ea1de2 100644
--- a/server.py
+++ b/server.py
@@ -12,6 +12,7 @@
 import wave
 import os
 import time
+import torch
 import logging
 
 from transformers import pipeline
@@ -37,14 +38,16 @@
 
 audio_dir = "audio_files"
 os.makedirs(audio_dir, exist_ok=True)
+device = torch.device("cuda", 1)
 
 ## ---------- INSTANTIATES VAD --------
 model = Model.from_pretrained("pyannote/segmentation", use_auth_token=VAD_AUTH_TOKEN)
-vad_pipeline = VoiceActivityDetection(segmentation=model)
+vad_pipeline = VoiceActivityDetection(segmentation=model, device=device)
 vad_pipeline.instantiate({"onset": 0.5, "offset": 0.5, "min_duration_on": 0.3, "min_duration_off": 0.3})
 
 ## ---------- INSTANTIATES SPEECH --------
-recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
+#recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
+recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-medium.en", device=device)
 
 
 connected_clients = {}
@@ -122,10 +125,10 @@ async def transcribe_and_send(client_id, websocket, new_audio_data):
     if last_segment.end < (len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - int(client_configs[client_id]['chunk_offset_seconds']):
         start_time_transcription = time.time()
         
-        if client_configs[client_id]['language'] is not None:
-            result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']})
-        else:
-            result = recognition_pipeline(file_name)
+        # if client_configs[client_id]['language'] is not None:
+        #     result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']})
+        # else:
+        result = recognition_pipeline(file_name)
 
         transcription_time = time.time() - start_time_transcription
 
@@ -144,6 +147,7 @@ async def transcribe_and_send(client_id, websocket, new_audio_data):
     os.remove(file_name) # in the end always delete the created file
 
 async def receive_audio(websocket, path):
+    print(f"websocket type: {websocket}")
     client_id = str(uuid.uuid4())
     connected_clients[client_id] = websocket
     client_buffers[client_id] = bytearray()
@@ -165,8 +169,9 @@ async def receive_audio(websocket, path):
                 print(f"Unexpected message type from {client_id}")
 
             # Process audio when enough data is received
-            if len(client_buffers[client_id]) > int(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH:
-                if DEBUG: print(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}")
+            config_buf_size = int(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH
+            if len(client_buffers[client_id]) > config_buf_size:
+                if DEBUG: print(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}")
                 await transcribe_and_send(client_id, websocket, client_buffers[client_id])
                 client_buffers[client_id].clear()
 

From 43103e8157b05f6ae18fa0354739bc9fc6180b8a Mon Sep 17 00:00:00 2001
From: wangji <wangji@shoppal.ai>
Date: Fri, 5 Jan 2024 09:42:22 +0000
Subject: [PATCH 07/12] asr average latency~0.5s

---
 .devcontainer/devcontainer.json  |  2 +-
 .devcontainer/docker-compose.yml |  2 +-
 .gitignore                       |  5 ++-
 server.py                        | 61 ++++++++++++++++++++------------
 utils/log.py                     | 36 +++++++++++++++++++
 5 files changed, 80 insertions(+), 26 deletions(-)
 create mode 100644 utils/log.py

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index ebb4024..64ea849 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -39,7 +39,7 @@
 
     // Use 'forwardPorts' to make a list of ports inside the container available locally.
     // This can be used to network with other containers or the host.
-    "forwardPorts": [8765],
+    "forwardPorts": [9876],
 
     // Run Args to use GPU
     // "runArgs": ["--gpus", "all"],
diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
index 0e20b7a..0b406d2 100644
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@@ -12,7 +12,7 @@ services:
       - ~/VoiceStreamAI:/workspace:cached
       - /data0:/data0:cached
     ports:
-      - "8765:8765"
+      - "9876:9876"
     user: vscode
     command: sleep infinity
     deploy:
diff --git a/.gitignore b/.gitignore
index 7700350..0af100c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,7 @@
 .env
 .DS_STORE
 SHOPPAL_README.md
-audio_files/*
\ No newline at end of file
+audio_files/*
+__pycache__/
+*.ipynb
+.ipynb_checkpoints
\ No newline at end of file
diff --git a/server.py b/server.py
index 5ea1de2..7d720ce 100644
--- a/server.py
+++ b/server.py
@@ -14,14 +14,17 @@
 import time
 import torch
 import logging
-
+import time
 from transformers import pipeline
 from pyannote.core import Segment
 from pyannote.audio import Model
 from pyannote.audio.pipelines import VoiceActivityDetection
+from utils.log import configure_logging
+logger = configure_logging()
+
 
 HOST = '0.0.0.0'
-PORT = 8765
+PORT = 9876
 SAMPLING_RATE = 16000
 AUDIO_CHANNELS = 1
 SAMPLES_WIDTH = 2 # int16
@@ -56,29 +59,30 @@
 client_configs = {}
 # Counter for each client to keep track of file numbers
 file_counters = {}
+recv_time = {}
 
 
 
 async def transcribe_and_send(client_id, websocket, new_audio_data):
     global file_counters
 
-    if DEBUG: print(f"Client ID {client_id}: new_audio_data length in seconds at transcribe_and_send: {float(len(new_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
+    logger.info(f"Client ID {client_id}: new_audio_data length in seconds at transcribe_and_send: {float(len(new_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
 
     # Initialize temporary buffer for new clients
     if client_id not in client_temp_buffers:
         client_temp_buffers[client_id] = bytearray()
 
-    if DEBUG: print(f"Client ID {client_id}: client_temp_buffers[client_id] length in seconds at transcribe_and_send: {float(len(client_temp_buffers[client_id])) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
+    logger.info(f"Client ID {client_id}: client_temp_buffers[client_id] length in seconds at transcribe_and_send: {float(len(client_temp_buffers[client_id])) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
 
     # Add new audio data to the temporary buffer
     old_audio_data = bytes(client_temp_buffers[client_id])
 
-    if DEBUG: print(f"Client ID {client_id}: old_audio_data length in seconds at transcribe_and_send: {float(len(old_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
+    logger.info(f"Client ID {client_id}: old_audio_data length in seconds at transcribe_and_send: {float(len(old_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
 
 
     audio_data = old_audio_data + new_audio_data
 
-    if DEBUG: print(f"Client ID {client_id}: audio_data length in seconds at transcribe_and_send: {float(len(audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
+    logger.info(f"Client ID {client_id}: audio_data length in seconds at transcribe_and_send: {float(len(audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
     
     # Initialize file counter for new clients
     if client_id not in file_counters:
@@ -87,7 +91,7 @@ async def transcribe_and_send(client_id, websocket, new_audio_data):
     # File path
     file_name = f"{audio_dir}/{client_id}_{file_counters[client_id]}.wav"
 
-    if DEBUG: print(f"Client ID {client_id}: Filename : {file_name}")
+    logger.info(f"Client ID {client_id}: Filename : {file_name}")
 
     file_counters[client_id] += 1
 
@@ -104,8 +108,8 @@ async def transcribe_and_send(client_id, websocket, new_audio_data):
     vad_time = time.time() - start_time_vad
 
     # Logging after VAD
-    if DEBUG: print(f"Client ID {client_id}: VAD result segments count: {len(result)}")
-    print(f"Client ID {client_id}: VAD inference time: {vad_time:.2f}")
+    logger.info(f"Client ID {client_id}: VAD result segments count: {len(result)}")
+    logger.info(f"Client ID {client_id}: VAD inference time: {vad_time:.2f}")
 
     if len(result) == 0: # this should happen just if there's no old audio data
         os.remove(file_name)
@@ -119,10 +123,13 @@ async def transcribe_and_send(client_id, websocket, new_audio_data):
     for segment in result.itersegments():
         last_segment = segment
 
-    if DEBUG: print(f"Client ID {client_id}: VAD last Segment end : {last_segment.end}")
+    logger.info(f"Client ID {client_id}: VAD last Segment end : {last_segment.end}")
     
+    accumulated_secs = len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)
     # if the voice ends before chunk_offset_seconds process it all
-    if last_segment.end < (len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - int(client_configs[client_id]['chunk_offset_seconds']):
+    timeout_flag = accumulated_secs > 5
+    seg_flag = last_segment.end < accumulated_secs - float(client_configs[client_id]['chunk_offset_seconds'])
+    if timeout_flag or seg_flag :
         start_time_transcription = time.time()
         
         # if client_configs[client_id]['language'] is not None:
@@ -132,58 +139,66 @@ async def transcribe_and_send(client_id, websocket, new_audio_data):
 
         transcription_time = time.time() - start_time_transcription
 
-        if DEBUG: print(f"Transcription Time: {transcription_time:.2f} seconds")
+        logger.info(f"Transcription Time: {transcription_time:.2f} seconds")
 
-        print(f"Client ID {client_id}: Transcribed : {result['text']}")
+        logger.info(f"Client ID {client_id}: Transcribed : {result['text']}")
 
         if result['text']:
-            await websocket.send(result['text'])
+            
+            time_delta = time.time() - recv_time[client_id]
+            time_delta_str = f"|{time_delta:.3f}s|"
+            sep_text = time_delta_str if seg_flag else f"......{time_delta_str}"
+            await websocket.send(result['text'] + sep_text)
             client_temp_buffers[client_id].clear() # Clear temp buffer after processing
     else:
         client_temp_buffers[client_id].clear()
         client_temp_buffers[client_id].extend(audio_data)
-        if DEBUG: print(f"Skipping because {last_segment.end} falls after {(len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - int(client_configs[client_id]['chunk_offset_seconds'])}")
+        logger.info(f"Skipping because {last_segment.end} falls after {(len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - float(client_configs[client_id]['chunk_offset_seconds'])}")
 
     os.remove(file_name) # in the end always delete the created file
 
 async def receive_audio(websocket, path):
-    print(f"websocket type: {websocket}")
+    logger.info(f"websocket type: {websocket}")
     client_id = str(uuid.uuid4())
     connected_clients[client_id] = websocket
     client_buffers[client_id] = bytearray()
+    recv_time[client_id] = None # recv time list
     client_configs[client_id] = DEFAULT_CLIENT_CONFIG
     
-    print(f"Client {client_id} connected")
+    logger.info(f"Client {client_id} connected")
+    
 
     try:
         async for message in websocket:
             if isinstance(message, bytes):
                 client_buffers[client_id].extend(message)
+                recv_time[client_id] = time.time()
             elif isinstance(message, str):
                 config = json.loads(message)
                 if config.get('type') == 'config':
                     client_configs[client_id] = config['data']
-                    print(f"Config for {client_id}: {client_configs[client_id]}")
+                    logger.info(f"Config for {client_id}: {client_configs[client_id]}")
                     continue
             else:
-                print(f"Unexpected message type from {client_id}")
+                logger.info(f"Unexpected message type from {client_id}")
 
             # Process audio when enough data is received
-            config_buf_size = int(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH
+            config_buf_size = float(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH
             if len(client_buffers[client_id]) > config_buf_size:
-                if DEBUG: print(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}")
+                logger.info(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}")
                 await transcribe_and_send(client_id, websocket, client_buffers[client_id])
                 client_buffers[client_id].clear()
+                recv_time[client_id] = list()
 
     except websockets.ConnectionClosed as e:
-        print(f"Connection with {client_id} closed: {e}")
+        logger.info(f"Connection with {client_id} closed: {e}")
     finally:
         del connected_clients[client_id]
         del client_buffers[client_id]
 
 async def main():
     async with websockets.serve(receive_audio, HOST, PORT):
-        print(f"WebSocket server started on ws://{HOST}:{PORT}")
+        logger.info(f"WebSocket server started on ws://{HOST}:{PORT}")
         await asyncio.Future()
 
 if __name__ == "__main__":
diff --git a/utils/log.py b/utils/log.py
new file mode 100644
index 0000000..4260590
--- /dev/null
+++ b/utils/log.py
@@ -0,0 +1,36 @@
+import logging
+
+def configure_logging():
+    """Return this module's logger, set to INFO, with a single console handler.
+
+    Safe to call more than once: the handler is attached only while the logger
+    has none, so repeated calls do not make every record print several times.
+    """
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.INFO)
+
+    # Timestamp, source file and line number, level name, then the message.
+    formatter = logging.Formatter('%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s')
+
+    if not logger.handlers:  # attach once; a second handler would duplicate output
+        console_handler = logging.StreamHandler()
+        console_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+
+    # Optionally, add a file handler to log to a file
+    # file_handler = logging.FileHandler('logfile.log')
+    # file_handler.setFormatter(formatter)
+    # logger.addHandler(file_handler)
+
+    return logger
+
+if __name__ == '__main__':
+    # Manual check when this file is run directly: build the logger defined above.
+    logger = configure_logging()
+
+    # One call per level; the debug() call prints nothing because configure_logging() sets the level to INFO.
+    logger.debug('This is a debug message')
+    logger.info('This is an info message')
+    logger.warning('This is a warning message')
+    logger.error('This is an error message')
+    logger.critical('This is a critical message')

From cb72dfafdcd59026ade41b6c173ae75f7b682898 Mon Sep 17 00:00:00 2001
From: Xu Wenhao <xuwenhao@shoppal.ai>
Date: Fri, 5 Jan 2024 12:16:41 +0000
Subject: [PATCH 08/12] use a random port and do not bind port in the
 Dockerfile or devcontainer.json

---
 .devcontainer/Dockerfile         |  3 ---
 .devcontainer/devcontainer.json  |  2 +-
 .devcontainer/docker-compose.yml | 13 +++++++------
 server.py                        |  3 ++-
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index e485bfe..a3e9b2c 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,8 +1,5 @@
 FROM ghcr.io/shoppal-ai/llm-base
 
-ARG HTTP_PROXY
-ARG HTTPS_PROXY
-
 RUN apt update && apt install -y zsh curl git sudo wget libsndfile1 ffmpeg
 
 # # install python environment
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 64ea849..e52b57f 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -39,7 +39,7 @@
 
     // Use 'forwardPorts' to make a list of ports inside the container available locally.
     // This can be used to network with other containers or the host.
-    "forwardPorts": [9876],
+    // "forwardPorts": [8080],
 
     // Run Args to use GPU
     // "runArgs": ["--gpus", "all"],
diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
index 0b406d2..add0c59 100644
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@@ -5,18 +5,19 @@ services:
     build:
       context: ..
       args:
-        HTTP_PROXY: http://10.232.14.15:8118
-        HTTPS_PROXY: http://10.232.14.15:8118
+        http_proxy: ${http_proxy}
+        https_proxy: ${https_proxy}
       dockerfile: .devcontainer/Dockerfile
     volumes:
       - ~/VoiceStreamAI:/workspace:cached
       - /data0:/data0:cached
-    ports:
-      - "9876:9876"
     user: vscode
     command: sleep infinity
-    deploy:
+    deploy:      
       resources:
         reservations:
           devices:
-          - capabilities: [gpu]
\ No newline at end of file
+          - capabilities: [gpu]
+    environment:
+      - http_proxy=${http_proxy}
+      - https_proxy=${https_proxy}
\ No newline at end of file
diff --git a/server.py b/server.py
index 7d720ce..8194df7 100644
--- a/server.py
+++ b/server.py
@@ -15,6 +15,7 @@
 import torch
 import logging
 import time
+import random
 from transformers import pipeline
 from pyannote.core import Segment
 from pyannote.audio import Model
@@ -24,7 +25,7 @@
 
 
 HOST = '0.0.0.0'
-PORT = 9876
+PORT = int(os.environ.get("SERVER_PORT", random.randint(10000, 11000)))  # env values are str; normalize to int and fail fast on bad input
 SAMPLING_RATE = 16000
 AUDIO_CHANNELS = 1
 SAMPLES_WIDTH = 2 # int16

From 5a0a40bb17716aee373246fa5d1c18e209d7a3f2 Mon Sep 17 00:00:00 2001
From: wangji <wangji@shoppal.ai>
Date: Sun, 7 Jan 2024 15:15:25 +0000
Subject: [PATCH 09/12] stream by vad stop

---
 .devcontainer/docker-compose.yml |   4 +-
 requirements-dev.txt             |   3 +
 server.py                        | 212 ++++++++++++++-----------------
 utils/llm.py                     |  18 +++
 4 files changed, 115 insertions(+), 122 deletions(-)
 create mode 100644 utils/llm.py

diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
index 0b406d2..cd108b9 100644
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@@ -11,8 +11,8 @@ services:
     volumes:
       - ~/VoiceStreamAI:/workspace:cached
       - /data0:/data0:cached
-    ports:
-      - "9876:9876"
+    # ports:
+      # - "9876:9876"
     user: vscode
     command: sleep infinity
     deploy:
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 4788a7f..cad3105 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,3 +4,6 @@ pyannote.audio
 torchvision
 transformers
 websockets
+jupyter
+datasets
+openai
\ No newline at end of file
diff --git a/server.py b/server.py
index 7d720ce..896f8c1 100644
--- a/server.py
+++ b/server.py
@@ -1,10 +1,3 @@
-"""
-VoiceStreamAI Server: Real-time audio transcription using self-hosted Whisper and WebSocket
-
-Contributors:
-- Alessandro Saccoia - alessandro.saccoia@gmail.com
-"""
-
 import asyncio
 import websockets
 import uuid
@@ -14,159 +7,128 @@
 import time
 import torch
 import logging
+import sys
 import time
 from transformers import pipeline
 from pyannote.core import Segment
 from pyannote.audio import Model
 from pyannote.audio.pipelines import VoiceActivityDetection
 from utils.log import configure_logging
+import numpy as np
+import io
+from utils.llm import chat
+import soundfile as sf
+
 logger = configure_logging()
 
 
-HOST = '0.0.0.0'
+HOST = "0.0.0.0"
 PORT = 9876
 SAMPLING_RATE = 16000
 AUDIO_CHANNELS = 1
-SAMPLES_WIDTH = 2 # int16
-DEBUG = True
-VAD_AUTH_TOKEN = os.environ.get("HF_TOKEN") # get your key here -> https://huggingface.co/pyannote/segmentation
+SAMPLES_WIDTH = 2  # int16
+VAD_AUTH_TOKEN = os.environ.get(
+    "HF_TOKEN"
+)  # get your key here -> https://huggingface.co/pyannote/segmentation
 
 DEFAULT_CLIENT_CONFIG = {
-    "language" : None, # multilingual
-    "chunk_length_seconds" : 5,
-    "chunk_offset_seconds" : 1
+    "language": None,  # multilingual
+    "chunk_length_seconds": 2,
+    "chunk_offset_seconds": 0.5,
 }
 
 
-
-audio_dir = "audio_files"
-os.makedirs(audio_dir, exist_ok=True)
 device = torch.device("cuda", 1)
 
 ## ---------- INSTANTIATES VAD --------
 model = Model.from_pretrained("pyannote/segmentation", use_auth_token=VAD_AUTH_TOKEN)
 vad_pipeline = VoiceActivityDetection(segmentation=model, device=device)
-vad_pipeline.instantiate({"onset": 0.5, "offset": 0.5, "min_duration_on": 0.3, "min_duration_off": 0.3})
+vad_pipeline.instantiate(
+    {"onset": 0.5, "offset": 0.5, "min_duration_on": 0.3, "min_duration_off": 0.3}
+)
 
 ## ---------- INSTANTIATES SPEECH --------
-#recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
-recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-medium.en", device=device)
+# recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
+recognition_pipeline = pipeline(
+    "automatic-speech-recognition", model="openai/whisper-medium.en", device=device
+)
 
 
 connected_clients = {}
 client_buffers = {}
 client_temp_buffers = {}
 client_configs = {}
-# Counter for each client to keep track of file numbers
-file_counters = {}
 recv_time = {}
+file_count = 0
 
+async def transcribe_and_send(client_id, websocket):
+    global file_count
+    if client_id in client_temp_buffers:
+        client_temp_buffers[client_id] = client_temp_buffers[client_id] + client_buffers[client_id]
+    else:
+        client_temp_buffers[client_id] = bytearray(client_buffers[client_id])  # copy: the caller clears client_buffers in place after this call
 
+    cur_data = client_temp_buffers[client_id]
+    duration = float(len(cur_data)) / (SAMPLES_WIDTH * SAMPLING_RATE)
 
-async def transcribe_and_send(client_id, websocket, new_audio_data):
-    global file_counters
-
-    logger.info(f"Client ID {client_id}: new_audio_data length in seconds at transcribe_and_send: {float(len(new_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
-
-    # Initialize temporary buffer for new clients
-    if client_id not in client_temp_buffers:
-        client_temp_buffers[client_id] = bytearray()
-
-    logger.info(f"Client ID {client_id}: client_temp_buffers[client_id] length in seconds at transcribe_and_send: {float(len(client_temp_buffers[client_id])) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
-
-    # Add new audio data to the temporary buffer
-    old_audio_data = bytes(client_temp_buffers[client_id])
-
-    logger.info(f"Client ID {client_id}: old_audio_data length in seconds at transcribe_and_send: {float(len(old_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
-
-
-    audio_data = old_audio_data + new_audio_data
-
-    logger.info(f"Client ID {client_id}: audio_data length in seconds at transcribe_and_send: {float(len(audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}")
-    
-    # Initialize file counter for new clients
-    if client_id not in file_counters:
-        file_counters[client_id] = 0
-
-    # File path
-    file_name = f"{audio_dir}/{client_id}_{file_counters[client_id]}.wav"
-
-    logger.info(f"Client ID {client_id}: Filename : {file_name}")
-
-    file_counters[client_id] += 1
-
-    # Save the audio data
-    with wave.open(file_name, 'wb') as wav_file:
-        wav_file.setnchannels(AUDIO_CHANNELS)
-        wav_file.setsampwidth(SAMPLES_WIDTH)
-        wav_file.setframerate(SAMPLING_RATE)
-        wav_file.writeframes(audio_data)
-
-    # Measure VAD time
+    # vad inference
+    numpy_audio = np.frombuffer(cur_data, dtype=np.int16)
+    tensor_audio = torch.tensor(numpy_audio, dtype=torch.float32).view(1, -1)
     start_time_vad = time.time()
-    result = vad_pipeline(file_name)
+    vad_result = vad_pipeline({"waveform":tensor_audio, "sample_rate":SAMPLING_RATE})
     vad_time = time.time() - start_time_vad
+    logger.info(f"Client ID {client_id}: VAD infer time:{vad_time:.2f}, VAD segments: {len(vad_result)}, current audio length: {duration:.2f}s")
 
-    # Logging after VAD
-    logger.info(f"Client ID {client_id}: VAD result segments count: {len(result)}")
-    logger.info(f"Client ID {client_id}: VAD inference time: {vad_time:.2f}")
-
-    if len(result) == 0: # this should happen just if there's no old audio data
-        os.remove(file_name)
-        client_temp_buffers[client_id].clear() 
+    if len(vad_result) == 0:
+        logger.info("drop this segment due to no voice activity found")
+        client_temp_buffers[client_id]= bytearray()
         return
     
-    
-    
-    # Get last recognized segment
-    last_segment = None
-    for segment in result.itersegments():
-        last_segment = segment
-
-    logger.info(f"Client ID {client_id}: VAD last Segment end : {last_segment.end}")
-    
-    accumulated_secs = len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)
-    # if the voice ends before chunk_offset_seconds process it all
-    timeout_flag = accumulated_secs > 5
-    seg_flag = last_segment.end < accumulated_secs - float(client_configs[client_id]['chunk_offset_seconds'])
-    if timeout_flag or seg_flag :
-        start_time_transcription = time.time()
-        
-        # if client_configs[client_id]['language'] is not None:
-        #     result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']})
+    end = 0
+    for segment in vad_result.itersegments():
+        # if segment.start - end > client_configs[client_id]['chunk_offset_seconds']:
+        #     # ASR pipeline
+        #     cut_point = int(end * (SAMPLES_WIDTH * SAMPLING_RATE))
+        #     cur_numpy = np.frombuffer(cur_data[:cut_point], dtype=np.int16)
+        #     asr_result = recognition_pipeline(cur_numpy)
+        #     client_buffers[client_id] = client_buffers[client_id][cut_point:]
+        #     if asr_result["text"]:
+        #         question = asr_result['text']
+        #         answer = chat(question)
+        #         await websocket.send(f"Q: {question}  A: {answer}")
+        #     return 
         # else:
-        result = recognition_pipeline(file_name)
-
-        transcription_time = time.time() - start_time_transcription
-
-        logger.info(f"Transcription Time: {transcription_time:.2f} seconds")
-
-        logger.info(f"Client ID {client_id}: Transcribed : {result['text']}")
-
-        if result['text']:
-            
-            time_delta = time.time() - recv_time[client_id]
-            time_delta_str = f"|{time_delta:.3f}s|"
-            sep_text = time_delta_str if seg_flag else f"......{time_delta_str}"
-            await websocket.send(result['text'] + sep_text)
-            client_temp_buffers[client_id].clear() # Clear temp buffer after processing
-    else:
-        client_temp_buffers[client_id].clear()
-        client_temp_buffers[client_id].extend(audio_data)
-        logger.info(f"Skipping because {last_segment.end} falls after {(len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - float(client_configs[client_id]['chunk_offset_seconds'])}")
+        end = segment.end
+    if duration - end > client_configs[client_id]['chunk_offset_seconds']:
+        cut_point = int(end * SAMPLING_RATE) * SAMPLES_WIDTH
+        logger.info(f"buffer size: {len(cur_data)}, cut_point: {cut_point}")
+        cur_numpy = np.frombuffer(cur_data[:cut_point], dtype=np.int16)
+        asr_result = recognition_pipeline(cur_numpy)
+        client_temp_buffers[client_id] = cur_data[cut_point:]
+        if asr_result["text"]:
+            file_count += 1
+            question = asr_result['text']
+            file_name = os.path.join('audio_files', f"{question}_{file_count}.wav")
+            with wave.open(file_name, 'wb') as wav_file:
+                wav_file.setnchannels(AUDIO_CHANNELS)
+                wav_file.setsampwidth(SAMPLES_WIDTH)
+                wav_file.setframerate(SAMPLING_RATE)
+                wav_file.writeframes(cur_data[:cut_point])
+            answer = chat(question)
+            await websocket.send(f"Q: {question}  A: {answer}")
+        return 
+    
 
-    os.remove(file_name) # in the end always delete the created file
 
 async def receive_audio(websocket, path):
     logger.info(f"websocket type: {websocket}")
     client_id = str(uuid.uuid4())
     connected_clients[client_id] = websocket
     client_buffers[client_id] = bytearray()
-    recv_time[client_id] = None # recv time list
+    recv_time[client_id] = None  # recv time list
     client_configs[client_id] = DEFAULT_CLIENT_CONFIG
-    
+
     logger.info(f"Client {client_id} connected")
-    
 
     try:
         async for message in websocket:
@@ -174,21 +136,29 @@ async def receive_audio(websocket, path):
                 client_buffers[client_id].extend(message)
                 recv_time[client_id] = time.time()
             elif isinstance(message, str):
-                config = json.loads(message)
-                if config.get('type') == 'config':
-                    client_configs[client_id] = config['data']
-                    logger.info(f"Config for {client_id}: {client_configs[client_id]}")
-                    continue
+                # config = json.loads(message)
+                # if config.get("type") == "config":
+                #     client_configs[client_id] = config["data"]
+                #     logger.info(f"Config for {client_id}: {client_configs[client_id]}")
+                continue
             else:
                 logger.info(f"Unexpected message type from {client_id}")
 
             # Process audio when enough data is received
-            config_buf_size = float(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH
+            config_buf_size = (
+                float(client_configs[client_id]["chunk_length_seconds"])
+                * SAMPLING_RATE
+                * SAMPLES_WIDTH
+            )
             if len(client_buffers[client_id]) > config_buf_size:
-                logger.info(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}")
-                await transcribe_and_send(client_id, websocket, client_buffers[client_id])
+                logger.info(
+                    f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}"
+                )
+                await transcribe_and_send(
+                    client_id, websocket
+                )
                 client_buffers[client_id].clear()
-                recv_time[client_id] = list()
+                recv_time[client_id] = None
 
     except websockets.ConnectionClosed as e:
         logger.info(f"Connection with {client_id} closed: {e}")
@@ -196,10 +166,12 @@ async def receive_audio(websocket, path):
         del connected_clients[client_id]
         del client_buffers[client_id]
 
+
 async def main():
     async with websockets.serve(receive_audio, HOST, PORT):
         logger.info(f"WebSocket server started on ws://{HOST}:{PORT}")
         await asyncio.Future()
 
+
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/utils/llm.py b/utils/llm.py
new file mode 100644
index 0000000..866e387
--- /dev/null
+++ b/utils/llm.py
@@ -0,0 +1,18 @@
+import openai
+import os
+
+
+client = openai.Client(api_key="fake_key", base_url="http://vllm:8000/v1/")
+
+def chat(text):
+    return f"what do you mean by {text}"
+
+    # response = client.chat.completions.create(
+    #         model="/data0/model_output/shoppal-test/dreampal",
+    #         messages= [{"role": "system", "content": "You are now a dream interpretation expert. Please analyze the description of the dream that I input."},
+    #                    {"role": "user", "content": text }],
+    #         # response_format={ "type": "json_object" },
+    #         stream=False
+    #     )
+
+    # return response
\ No newline at end of file

From 99ede15e70676d7ed546e5b401b3d3dd21330b2c Mon Sep 17 00:00:00 2001
From: wangji <wangji@shoppal.ai>
Date: Mon, 8 Jan 2024 03:13:35 +0000
Subject: [PATCH 10/12] build with proxy

---
 .devcontainer/.env               | 4 ++++
 .devcontainer/docker-compose.yml | 4 ++--
 .env.example                     | 4 ----
 .gitignore                       | 1 -
 4 files changed, 6 insertions(+), 7 deletions(-)
 create mode 100644 .devcontainer/.env
 delete mode 100644 .env.example

diff --git a/.devcontainer/.env b/.devcontainer/.env
new file mode 100644
index 0000000..4ef9382
--- /dev/null
+++ b/.devcontainer/.env
@@ -0,0 +1,4 @@
+http_proxy=http://10.232.14.15:8118
+https_proxy=http://10.232.14.15:8118
+HF_ENDPOINT=https://hf-mirror.com
+HF_TOKEN=
\ No newline at end of file
diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
index add0c59..5fc7fe7 100644
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@@ -3,11 +3,11 @@ version: '3.8'
 services:
   app:
     build:
-      context: ..
+      context: .
       args:
         http_proxy: ${http_proxy}
         https_proxy: ${https_proxy}
-      dockerfile: .devcontainer/Dockerfile
+      dockerfile: Dockerfile
     volumes:
       - ~/VoiceStreamAI:/workspace:cached
       - /data0:/data0:cached
diff --git a/.env.example b/.env.example
deleted file mode 100644
index 136fcc3..0000000
--- a/.env.example
+++ /dev/null
@@ -1,4 +0,0 @@
-HTTP_PROXY=
-HTTPS_PROXY=
-HF_ENDPOINT=https://hf-mirror.com
-HF_TOKEN=
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 0af100c..4da0aa2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
-.env
 .DS_STORE
 SHOPPAL_README.md
 audio_files/*

From b9df2ba36b87200437297e5db55e029b30fba3f1 Mon Sep 17 00:00:00 2001
From: wangji <wangji@shoppal.ai>
Date: Mon, 8 Jan 2024 03:23:04 +0000
Subject: [PATCH 11/12] add test.py smoke script for the vLLM chat endpoint

---
 test.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 test.py

diff --git a/test.py b/test.py
new file mode 100644
index 0000000..b32718f
--- /dev/null
+++ b/test.py
@@ -0,0 +1,44 @@
+import openai
+
+import os
+
+api_key = "fake_key"
+api_base = "http://vllm:8000/v1/"
+client = openai.Client(api_key=api_key, base_url=api_base)
+
+model_name = "/data0/model_output/shoppal-test/dreampal"
+
+def predict(message, history, system_prompt):
+    history_openai_format = []
+    history_openai_format.append({"role": "system", "content": system_prompt})
+    for human, assistant in history:
+        history_openai_format.append({"role": "user", "content": human })
+        history_openai_format.append({"role": "assistant", "content":assistant})
+    history_openai_format.append({"role": "user", "content": message})
+
+    response = client.chat.completions.create(
+        model=model_name,
+        messages= history_openai_format,
+        # response_format={ "type": "json_object" },
+        stream=True
+    )
+
+    partial_message = ""
+    for chunk in response:
+        if chunk.choices[0].delta.content and len(chunk.choices[0].delta.content) != 0:
+            partial_message = partial_message + chunk.choices[0].delta.content
+            yield partial_message
+
+system_prompt = """
+You are now a dream interpretation expert. Please analyze the description of the dream that I input.
+"""
+
+response = client.chat.completions.create(
+        model=model_name,
+        messages= [{"role": "system", "content": system_prompt},
+                   {"role": "user", "content": "hello" }],
+        # response_format={ "type": "json_object" },
+        #stream=False
+    )
+
+#print(response.choices[0].message)
\ No newline at end of file

From 7c2fddac1ad7cbebe91f50824168680bf7b9c856 Mon Sep 17 00:00:00 2001
From: wangji <wangji@shoppal.ai>
Date: Mon, 8 Jan 2024 07:27:12 +0000
Subject: [PATCH 12/12] add llm: call the chat endpoint and send the answer to the client

---
 server.py    | 19 ++++++++++---------
 utils/llm.py | 22 ++++++++++++----------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/server.py b/server.py
index 231de44..1737bdf 100644
--- a/server.py
+++ b/server.py
@@ -34,8 +34,8 @@
 
 DEFAULT_CLIENT_CONFIG = {
     "language": None,  # multilingual
-    "chunk_length_seconds": 2,
-    "chunk_offset_seconds": 0.5,
+    "chunk_length_seconds": 5,
+    "chunk_offset_seconds": 1,
 }
 
 
@@ -51,7 +51,7 @@
 ## ---------- INSTANTIATES SPEECH --------
 # recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
 recognition_pipeline = pipeline(
-    "automatic-speech-recognition", model="openai/whisper-medium.en", device=device
+    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
 )
 
 
@@ -104,29 +104,30 @@ async def transcribe_and_send(client_id, websocket):
         cut_point = int(end * SAMPLING_RATE) * SAMPLES_WIDTH
         logger.info(f"buffer size: {len(cur_data)}, cut_point: {cut_point}")
         cur_numpy = np.frombuffer(cur_data[:cut_point], dtype=np.int16)
-        asr_result = recognition_pipeline(cur_numpy)
+        asr_result = recognition_pipeline({"sampling_rate":16000, "raw":cur_numpy})
         client_temp_buffers[client_id] = cur_data[cut_point:]
         if asr_result["text"]:
             file_count += 1
-            question = asr_result['text']
+            question = asr_result['text'] + f"...|{time.time()-recv_time[client_id]:.3f}s|"
+            await websocket.send(question)
             file_name = os.path.join('audio_files', f"{question}_{file_count}.wav")
             with wave.open(file_name, 'wb') as wav_file:
                 wav_file.setnchannels(AUDIO_CHANNELS)
                 wav_file.setsampwidth(SAMPLES_WIDTH)
                 wav_file.setframerate(SAMPLING_RATE)
                 wav_file.writeframes(cur_data[:cut_point])
-            answer = chat(question)
-            await websocket.send(f"Q: {question}  A: {answer}")
+            answer = chat(asr_result['text']) + f"...|{time.time()-recv_time[client_id]:.3f}s|"
+            await websocket.send(answer)
         return 
     
 
 
 async def receive_audio(websocket, path):
+    global recv_time
     logger.info(f"websocket type: {websocket}")
     client_id = str(uuid.uuid4())
     connected_clients[client_id] = websocket
     client_buffers[client_id] = bytearray()
-    recv_time[client_id] = None  # recv time list
     client_configs[client_id] = DEFAULT_CLIENT_CONFIG
 
     logger.info(f"Client {client_id} connected")
@@ -159,7 +160,7 @@ async def receive_audio(websocket, path):
                     client_id, websocket
                 )
                 client_buffers[client_id].clear()
-                recv_time[client_id] = None
+                
 
     except websockets.ConnectionClosed as e:
         logger.info(f"Connection with {client_id} closed: {e}")
diff --git a/utils/llm.py b/utils/llm.py
index 866e387..441075f 100644
--- a/utils/llm.py
+++ b/utils/llm.py
@@ -2,17 +2,19 @@
 import os
 
 
-client = openai.Client(api_key="fake_key", base_url="http://vllm:8000/v1/")
+client = openai.Client(api_key="fake_key", base_url=os.environ.get("LLM_BASE_URL", "http://10.232.14.16:8000/v1/"))  # endpoint overridable per deployment; default unchanged
 
 def chat(text):
-    return f"what do you mean by {text}"
+    response = client.chat.completions.create(
+            model="/data0/models/huggingface/meta-llama/Llama-2-7b-chat-hf/",
+            messages= [{"role": "system", "content": "you are a usefull agent and try to answer each question within 15 words"},
+                       {"role": "user", "content": text }],
+            # response_format={ "type": "json_object" },
+            #stream=False
+        )
 
-    # response = client.chat.completions.create(
-    #         model="/data0/model_output/shoppal-test/dreampal",
-    #         messages= [{"role": "system", "content": "You are now a dream interpretation expert. Please analyze the description of the dream that I input."},
-    #                    {"role": "user", "content": text }],
-    #         # response_format={ "type": "json_object" },
-    #         stream=False
-    #     )
+    return response.choices[0].message.content
 
-    # return response
\ No newline at end of file
+if __name__ == '__main__':
+    ret = chat("hello")
+    print(ret)
\ No newline at end of file