From b4a507d79819725921c2fccbc90ffec9319480bb Mon Sep 17 00:00:00 2001 From: Xu Wenhao Date: Tue, 2 Jan 2024 10:27:37 +0000 Subject: [PATCH 01/12] add dev container settings --- .devcontainer/Dockerfile | 29 +++++++++++++++++ .devcontainer/devcontainer.json | 52 ++++++++++++++++++++++++++++++ .devcontainer/docker-compose.yml | 13 ++++++++ .devcontainer/postCreateCommand.sh | 6 ++++ .gitignore | 1 + requirements-dev.txt | 4 +++ server.py | 2 +- 7 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json create mode 100644 .devcontainer/docker-compose.yml create mode 100755 .devcontainer/postCreateCommand.sh create mode 100644 .gitignore create mode 100644 requirements-dev.txt diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..0ad824b --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.8 + +RUN pip install --no-cache-dir --upgrade pip +RUN apt update && apt install -y zsh curl git sudo wget libsndfile1 + +ARG USERNAME=vscode +ARG USER_UID=1000 +ARG USER_GID=$USER_UID + +RUN groupadd --gid $USER_GID $USERNAME \ + && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME + +RUN usermod -aG sudo $USERNAME +RUN echo 'vscode ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +USER $USERNAME + +RUN cd ~ && wget https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh && sh install.sh + +# Create and activate a Python virtual environment +RUN python -m venv ~/venv +RUN echo "source ~/venv/bin/activate" >> ~/.zshrc + +# Set Python path in the virtual environment. +RUN echo "export PYTHONPATH=\$PYTHONPATH:/workspace" >> ~/.zshrc +RUN /bin/zsh ~/.zshrc + +ENV DEBIAN_FRONTEND=dialog + diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..0ae1534 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,52 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/postgres +{ + "name": "VoiceStreamAI Demo", + "dockerComposeFile": "docker-compose.yml", + "service": "app", + "workspaceFolder": "/workspace", + "customizations": { + "vscode": { + "settings": { + "terminal.integrated.defaultProfile.linux": "zsh", + "terminal.integrated.profiles.linux": { + "zsh": { + "path": "/bin/zsh" + } + } + }, + "extensions": [ + "GitHub.copilot", + "GitHub.copilot-labs", + "GitHub.vscode-pull-request-github", + "ms-python.python", + "ms-python.vscode-pylance", + "ms-python.pylint", + "ms-python.isort", + "ms-python.black-formatter", + "matangover.mypy", + "ms-toolsai.jupyter", + "ms-toolsai.jupyter-keymap", + "ms-toolsai.vscode-jupyter-slideshow", + "eamodio.gitlens", + "github.vscode-github-actions" + ] + } + }, + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // This can be used to network with other containers or the host. + "forwardPorts": [8765], + + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "./.devcontainer/postCreateCommand.sh" + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} + diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml new file mode 100644 index 0000000..2c7b43a --- /dev/null +++ b/.devcontainer/docker-compose.yml @@ -0,0 +1,13 @@ +version: '3.8' + +services: + app: + build: + context: .. + dockerfile: .devcontainer/Dockerfile + volumes: + - ..:/workspace + ports: + - "8765:8765" + user: vscode + command: sleep infinity \ No newline at end of file diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh new file mode 100755 index 0000000..a3dcd06 --- /dev/null +++ b/.devcontainer/postCreateCommand.sh @@ -0,0 +1,6 @@ +#!/bin/bash +source /home/vscode/venv/bin/activate +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip install -r requirements.txt +pip install -r requirements-dev.txt +sudo chown vscode:vscode /workspace diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c49bd7 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.env diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..0f9b57a --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +transformers +pyannote.core +pyannote.audio +websockets diff --git a/server.py b/server.py index 6f3a7b0..3f5c2b5 100644 --- a/server.py +++ b/server.py @@ -25,7 +25,7 @@ AUDIO_CHANNELS = 1 SAMPLES_WIDTH = 2 # int16 DEBUG = True -VAD_AUTH_TOKEN = "FILL ME" # get your key here -> https://huggingface.co/pyannote/segmentation +VAD_AUTH_TOKEN = os.environ.get("HF_TOKEN") # get your key here -> https://huggingface.co/pyannote/segmentation DEFAULT_CLIENT_CONFIG = { "language" : None, # multilingual From 9564d1996b5c2ce1663226166d2784823d4e2bfb Mon Sep 17 00:00:00 2001 From: Xu Wenhao Date: Tue, 2 Jan 2024 14:34:35 +0000 Subject: [PATCH 02/12] basic dev container setup --- .devcontainer/Dockerfile | 14 ++++++++++---- .devcontainer/devcontainer.json | 6 +++++- .devcontainer/docker-compose.yml | 12 ++++++++++-- .devcontainer/postCreateCommand.sh | 2 +- .env.example | 4 ++++ .gitignore | 1 + requirements-dev.txt | 1 + 7 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 .env.example diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 0ad824b..d344f52 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,8 +1,15 @@ -FROM python:3.8 +FROM ghcr.io/shoppal-ai/llm-base + +ARG HTTP_PROXY +ARG HTTPS_PROXY -RUN pip install --no-cache-dir --upgrade pip RUN apt update && apt install -y zsh curl git sudo wget libsndfile1 +# # install python environment +# RUN apt install -y python3.8-dev python3-pip +# RUN update-alternatives --install /usr/bin/pthon python /usr/bin/python3.10 1 +RUN apt install -y python3.10-venv + ARG USERNAME=vscode ARG USER_UID=1000 ARG USER_GID=$USER_UID @@ -17,8 +24,7 @@ USER $USERNAME RUN cd ~ && wget https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh && sh install.sh -# Create and activate a Python virtual environment -RUN python -m venv ~/venv +RUN python3 -m venv ~/venv RUN echo "source ~/venv/bin/activate" >> ~/.zshrc # Set Python path in the virtual environment. diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 0ae1534..ebb4024 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -4,6 +4,7 @@ "name": "VoiceStreamAI Demo", "dockerComposeFile": "docker-compose.yml", "service": "app", + "workspaceMount": "source=~/VoiceStreamAI,target=/workspace,type=bind,consistency=cached", "workspaceFolder": "/workspace", "customizations": { "vscode": { @@ -40,13 +41,16 @@ // This can be used to network with other containers or the host. "forwardPorts": [8765], + // Run Args to use GPU + // "runArgs": ["--gpus", "all"], // Use 'postCreateCommand' to run commands after the container is created. "postCreateCommand": "./.devcontainer/postCreateCommand.sh" // Configure tool-specific properties. // "customizations": {}, + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. - // "remoteUser": "root" + // "remoteUser": "root", } diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 2c7b43a..856e39c 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -4,10 +4,18 @@ services: app: build: context: .. + args: + HTTP_PROXY: ${HTTP_PROXY} + HTTPS_PROXY: ${HTTPS_PROXY} dockerfile: .devcontainer/Dockerfile volumes: - - ..:/workspace + - ~/VoiceStreamAI:/workspace:cached ports: - "8765:8765" user: vscode - command: sleep infinity \ No newline at end of file + command: sleep infinity + deploy: + resources: + reservations: + devices: + - capabilities: [gpu] \ No newline at end of file diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh index a3dcd06..7109689 100755 --- a/.devcontainer/postCreateCommand.sh +++ b/.devcontainer/postCreateCommand.sh @@ -3,4 +3,4 @@ source /home/vscode/venv/bin/activate pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple pip install -r requirements.txt pip install -r requirements-dev.txt -sudo chown vscode:vscode /workspace +sudo chown vscode:vscode /workspace \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..136fcc3 --- /dev/null +++ b/.env.example @@ -0,0 +1,4 @@ +HTTP_PROXY= +HTTPS_PROXY= +HF_ENDPOINT=https://hf-mirror.com +HF_TOKEN= \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4c49bd7..159be9e 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .env +SHOPPAL_README.md \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 0f9b57a..00b23dd 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,5 @@ transformers pyannote.core pyannote.audio +torchvision websockets From 9244bfc3f0525432fef167a6ad9892eaaf324cb5 Mon Sep 17 00:00:00 2001 From: Xu Wenhao Date: Tue, 2 Jan 2024 15:19:05 +0000 Subject: [PATCH 03/12] now it works in dev container --- .devcontainer/Dockerfile | 2 +- .gitignore | 3 ++- requirements-dev.txt | 3 ++- server.py | 6 +++++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index d344f52..c12810b 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -3,7 +3,7 @@ FROM ghcr.io/shoppal-ai/llm-base ARG HTTP_PROXY ARG HTTPS_PROXY -RUN apt update && apt install -y zsh curl git sudo wget libsndfile1 +RUN apt update && apt install -y zsh curl git sudo wget libsndfile1 ffmpeg # # install python environment # RUN apt install -y python3.8-dev python3-pip diff --git a/.gitignore b/.gitignore index 159be9e..47fadcd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .env -SHOPPAL_README.md \ No newline at end of file +SHOPPAL_README.md +audio_files/* \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 00b23dd..4788a7f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,6 @@ -transformers +ffmpeg pyannote.core pyannote.audio torchvision +transformers websockets diff --git a/server.py b/server.py index 3f5c2b5..f5136e7 100644 --- a/server.py +++ b/server.py @@ -19,7 +19,7 @@ from pyannote.audio import Model from pyannote.audio.pipelines import VoiceActivityDetection -HOST = 'localhost' +HOST = '0.0.0.0' PORT = 8765 SAMPLING_RATE = 16000 AUDIO_CHANNELS = 1 @@ -123,11 +123,15 @@ async def transcribe_and_send(client_id, websocket, new_audio_data): start_time_transcription = time.time() if client_configs[client_id]['language'] is not None: + print("Entering recognition pipeline no language") result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']}) else: + print("Entering recognition pipeline has language") result = recognition_pipeline(file_name) transcription_time = time.time() - start_time_transcription + print("result is ", result) + if DEBUG: print(f"Transcription Time: {transcription_time:.2f} seconds") print(f"Client ID {client_id}: Transcribed : {result['text']}") From 5692f514c94a39b23a047b00bc05095c8a097cf1 Mon Sep 17 00:00:00 2001 From: Xu Wenhao Date: Tue, 2 Jan 2024 15:21:36 +0000 Subject: [PATCH 04/12] remove useless logs --- server.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/server.py b/server.py index f5136e7..9682f30 100644 --- a/server.py +++ b/server.py @@ -123,14 +123,11 @@ async def transcribe_and_send(client_id, websocket, new_audio_data): start_time_transcription = time.time() if client_configs[client_id]['language'] is not None: - print("Entering recognition pipeline no language") result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']}) else: - print("Entering recognition pipeline has language") result = recognition_pipeline(file_name) transcription_time = time.time() - start_time_transcription - print("result is ", result) if DEBUG: print(f"Transcription Time: {transcription_time:.2f} seconds") From e68749c78cc091906eb52ac5da30bef464184d64 Mon Sep 17 00:00:00 2001 From: Xu Wenhao Date: Wed, 3 Jan 2024 13:49:15 +0000 Subject: [PATCH 05/12] fix environment issue --- .devcontainer/Dockerfile | 4 ++++ .devcontainer/docker-compose.yml | 5 +++-- .gitignore | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index c12810b..e485bfe 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -31,5 +31,9 @@ RUN echo "source ~/venv/bin/activate" >> ~/.zshrc RUN echo "export PYTHONPATH=\$PYTHONPATH:/workspace" >> ~/.zshrc RUN /bin/zsh ~/.zshrc +# Setup HF_ENDPOINT and PYANNOTE_CACHE +ENV HF_ENDPOINT=https://hf-mirror.com +ENV PYANNOTE_CACHE=/data0/cache/pyannote/ + ENV DEBIAN_FRONTEND=dialog diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 856e39c..0e20b7a 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -5,11 +5,12 @@ services: build: context: .. args: - HTTP_PROXY: ${HTTP_PROXY} - HTTPS_PROXY: ${HTTPS_PROXY} + HTTP_PROXY: http://10.232.14.15:8118 + HTTPS_PROXY: http://10.232.14.15:8118 dockerfile: .devcontainer/Dockerfile volumes: - ~/VoiceStreamAI:/workspace:cached + - /data0:/data0:cached ports: - "8765:8765" user: vscode diff --git a/.gitignore b/.gitignore index 47fadcd..7700350 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .env +.DS_STORE SHOPPAL_README.md audio_files/* \ No newline at end of file From 23ac5dfdf34a728da95b4322489725b50f27a723 Mon Sep 17 00:00:00 2001 From: wangji Date: Thu, 4 Jan 2024 13:19:39 +0000 Subject: [PATCH 06/12] select cuda device & en only model --- server.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/server.py b/server.py index 9682f30..5ea1de2 100644 --- a/server.py +++ b/server.py @@ -12,6 +12,7 @@ import wave import os import time +import torch import logging from transformers import pipeline @@ -37,14 +38,16 @@ audio_dir = "audio_files" os.makedirs(audio_dir, exist_ok=True) +device = torch.device("cuda", 1) ## ---------- INSTANTIATES VAD -------- model = Model.from_pretrained("pyannote/segmentation", use_auth_token=VAD_AUTH_TOKEN) -vad_pipeline = VoiceActivityDetection(segmentation=model) +vad_pipeline = VoiceActivityDetection(segmentation=model, device=device) vad_pipeline.instantiate({"onset": 0.5, "offset": 0.5, "min_duration_on": 0.3, "min_duration_off": 0.3}) ## ---------- INSTANTIATES SPEECH -------- -recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3") +#recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3") +recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-medium.en", device=device) connected_clients = {} @@ -122,10 +125,10 @@ async def transcribe_and_send(client_id, websocket, new_audio_data): if last_segment.end < (len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - int(client_configs[client_id]['chunk_offset_seconds']): start_time_transcription = time.time() - if client_configs[client_id]['language'] is not None: - result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']}) - else: - result = recognition_pipeline(file_name) + # if client_configs[client_id]['language'] is not None: + # result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']}) + # else: + result = recognition_pipeline(file_name) transcription_time = time.time() - start_time_transcription @@ -144,6 +147,7 @@ async def transcribe_and_send(client_id, websocket, new_audio_data): os.remove(file_name) # in the end always delete the created file async def receive_audio(websocket, path): + print(f"websocket type: {websocket}") client_id = str(uuid.uuid4()) connected_clients[client_id] = websocket client_buffers[client_id] = bytearray() @@ -165,8 +169,9 @@ async def receive_audio(websocket, path): print(f"Unexpected message type from {client_id}") # Process audio when enough data is received - if len(client_buffers[client_id]) > int(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH: - if DEBUG: print(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}") + config_buf_size = int(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH + if len(client_buffers[client_id]) > config_buf_size: + if DEBUG: print(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}") await transcribe_and_send(client_id, websocket, client_buffers[client_id]) client_buffers[client_id].clear() From 43103e8157b05f6ae18fa0354739bc9fc6180b8a Mon Sep 17 00:00:00 2001 From: wangji Date: Fri, 5 Jan 2024 09:42:22 +0000 Subject: [PATCH 07/12] asr average latency~0.5s --- .devcontainer/devcontainer.json | 2 +- .devcontainer/docker-compose.yml | 2 +- .gitignore | 5 ++- server.py | 61 ++++++++++++++++++++------------ utils/log.py | 36 +++++++++++++++++++ 5 files changed, 80 insertions(+), 26 deletions(-) create mode 100644 utils/log.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ebb4024..64ea849 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -39,7 +39,7 @@ // Use 'forwardPorts' to make a list of ports inside the container available locally. // This can be used to network with other containers or the host. - "forwardPorts": [8765], + "forwardPorts": [9876], // Run Args to use GPU // "runArgs": ["--gpus", "all"], diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 0e20b7a..0b406d2 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -12,7 +12,7 @@ services: - ~/VoiceStreamAI:/workspace:cached - /data0:/data0:cached ports: - - "8765:8765" + - "9876:9876" user: vscode command: sleep infinity deploy: diff --git a/.gitignore b/.gitignore index 7700350..0af100c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ .env .DS_STORE SHOPPAL_README.md -audio_files/* \ No newline at end of file +audio_files/* +__pycache__/ +*.ipynb +.ipynb_checkpoints \ No newline at end of file diff --git a/server.py b/server.py index 5ea1de2..7d720ce 100644 --- a/server.py +++ b/server.py @@ -14,14 +14,17 @@ import time import torch import logging - +import time from transformers import pipeline from pyannote.core import Segment from pyannote.audio import Model from pyannote.audio.pipelines import VoiceActivityDetection +from utils.log import configure_logging +logger = configure_logging() + HOST = '0.0.0.0' -PORT = 8765 +PORT = 9876 SAMPLING_RATE = 16000 AUDIO_CHANNELS = 1 SAMPLES_WIDTH = 2 # int16 @@ -56,29 +59,30 @@ client_configs = {} # Counter for each client to keep track of file numbers file_counters = {} +recv_time = {} async def transcribe_and_send(client_id, websocket, new_audio_data): global file_counters - if DEBUG: print(f"Client ID {client_id}: new_audio_data length in seconds at transcribe_and_send: {float(len(new_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") + logger.info(f"Client ID {client_id}: new_audio_data length in seconds at transcribe_and_send: {float(len(new_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") # Initialize temporary buffer for new clients if client_id not in client_temp_buffers: client_temp_buffers[client_id] = bytearray() - if DEBUG: print(f"Client ID {client_id}: client_temp_buffers[client_id] length in seconds at transcribe_and_send: {float(len(client_temp_buffers[client_id])) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") + logger.info(f"Client ID {client_id}: client_temp_buffers[client_id] length in seconds at transcribe_and_send: {float(len(client_temp_buffers[client_id])) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") # Add new audio data to the temporary buffer old_audio_data = bytes(client_temp_buffers[client_id]) - if DEBUG: print(f"Client ID {client_id}: old_audio_data length in seconds at transcribe_and_send: {float(len(old_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") + logger.info(f"Client ID {client_id}: old_audio_data length in seconds at transcribe_and_send: {float(len(old_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") audio_data = old_audio_data + new_audio_data - if DEBUG: print(f"Client ID {client_id}: audio_data length in seconds at transcribe_and_send: {float(len(audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") + logger.info(f"Client ID {client_id}: audio_data length in seconds at transcribe_and_send: {float(len(audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") # Initialize file counter for new clients if client_id not in file_counters: @@ -87,7 +91,7 @@ async def transcribe_and_send(client_id, websocket, new_audio_data): # File path file_name = f"{audio_dir}/{client_id}_{file_counters[client_id]}.wav" - if DEBUG: print(f"Client ID {client_id}: Filename : {file_name}") + logger.info(f"Client ID {client_id}: Filename : {file_name}") file_counters[client_id] += 1 @@ -104,8 +108,8 @@ async def transcribe_and_send(client_id, websocket, new_audio_data): vad_time = time.time() - start_time_vad # Logging after VAD - if DEBUG: print(f"Client ID {client_id}: VAD result segments count: {len(result)}") - print(f"Client ID {client_id}: VAD inference time: {vad_time:.2f}") + logger.info(f"Client ID {client_id}: VAD result segments count: {len(result)}") + logger.info(f"Client ID {client_id}: VAD inference time: {vad_time:.2f}") if len(result) == 0: # this should happen just if there's no old audio data os.remove(file_name) @@ -119,10 +123,13 @@ async def transcribe_and_send(client_id, websocket, new_audio_data): for segment in result.itersegments(): last_segment = segment - if DEBUG: print(f"Client ID {client_id}: VAD last Segment end : {last_segment.end}") + logger.info(f"Client ID {client_id}: VAD last Segment end : {last_segment.end}") + accumulated_secs = len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE) # if the voice ends before chunk_offset_seconds process it all - if last_segment.end < (len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - int(client_configs[client_id]['chunk_offset_seconds']): + timeout_flag = accumulated_secs > 5 + seg_flag = last_segment.end < accumulated_secs - float(client_configs[client_id]['chunk_offset_seconds']) + if timeout_flag or seg_flag : start_time_transcription = time.time() # if client_configs[client_id]['language'] is not None: @@ -132,58 +139,66 @@ async def transcribe_and_send(client_id, websocket, new_audio_data): transcription_time = time.time() - start_time_transcription - if DEBUG: print(f"Transcription Time: {transcription_time:.2f} seconds") + logger.info(f"Transcription Time: {transcription_time:.2f} seconds") - print(f"Client ID {client_id}: Transcribed : {result['text']}") + logger.info(f"Client ID {client_id}: Transcribed : {result['text']}") if result['text']: - await websocket.send(result['text']) + + time_delta = time.time() - recv_time[client_id] + time_delta_str = f"|{time_delta:.3f}s|" + sep_text = time_delta_str if seg_flag else f"......{time_delta_str}" + await websocket.send(result['text'] + sep_text) client_temp_buffers[client_id].clear() # Clear temp buffer after processing else: client_temp_buffers[client_id].clear() client_temp_buffers[client_id].extend(audio_data) - if DEBUG: print(f"Skipping because {last_segment.end} falls after {(len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - int(client_configs[client_id]['chunk_offset_seconds'])}") + logger.info(f"Skipping because {last_segment.end} falls after {(len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - float(client_configs[client_id]['chunk_offset_seconds'])}") os.remove(file_name) # in the end always delete the created file async def receive_audio(websocket, path): - print(f"websocket type: {websocket}") + logger.info(f"websocket type: {websocket}") client_id = str(uuid.uuid4()) connected_clients[client_id] = websocket client_buffers[client_id] = bytearray() + recv_time[client_id] = None # recv time list client_configs[client_id] = DEFAULT_CLIENT_CONFIG - print(f"Client {client_id} connected") + logger.info(f"Client {client_id} connected") + try: async for message in websocket: if isinstance(message, bytes): client_buffers[client_id].extend(message) + recv_time[client_id] = time.time() elif isinstance(message, str): config = json.loads(message) if config.get('type') == 'config': client_configs[client_id] = config['data'] - print(f"Config for {client_id}: {client_configs[client_id]}") + logger.info(f"Config for {client_id}: {client_configs[client_id]}") continue else: - print(f"Unexpected message type from {client_id}") + logger.info(f"Unexpected message type from {client_id}") # Process audio when enough data is received - config_buf_size = int(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH + config_buf_size = float(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH if len(client_buffers[client_id]) > config_buf_size: - if DEBUG: print(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}") + logger.info(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}") await transcribe_and_send(client_id, websocket, client_buffers[client_id]) client_buffers[client_id].clear() + recv_time[client_id] = list() except websockets.ConnectionClosed as e: - print(f"Connection with {client_id} closed: {e}") + logger.info(f"Connection with {client_id} closed: {e}") finally: del connected_clients[client_id] del client_buffers[client_id] async def main(): async with websockets.serve(receive_audio, HOST, PORT): - print(f"WebSocket server started on ws://{HOST}:{PORT}") + logger.info(f"WebSocket server started on ws://{HOST}:{PORT}") await asyncio.Future() if __name__ == "__main__": diff --git a/utils/log.py b/utils/log.py new file mode 100644 index 0000000..4260590 --- /dev/null +++ b/utils/log.py @@ -0,0 +1,36 @@ +import logging + +def configure_logging(): + # Create a logger + logger = logging.getLogger(__name__) + + # Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + logger.setLevel(logging.INFO) + + # Create a formatter + formatter = logging.Formatter('%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s') + + # Create a console handler and set the formatter + console_handler = logging.StreamHandler() + console_handler.setFormatter(formatter) + + # Add the console handler to the logger + logger.addHandler(console_handler) + + # Optionally, add a file handler to log to a file + # file_handler = logging.FileHandler('logfile.log') + # file_handler.setFormatter(formatter) + # logger.addHandler(file_handler) + + return logger + +if __name__ == '__main__': + # Configure logging + logger = configure_logging() + + # Example log messages + logger.debug('This is a debug message') + logger.info('This is an info message') + logger.warning('This is a warning message') + logger.error('This is an error message') + logger.critical('This is a critical message') From cb72dfafdcd59026ade41b6c173ae75f7b682898 Mon Sep 17 00:00:00 2001 From: Xu Wenhao Date: Fri, 5 Jan 2024 12:16:41 +0000 Subject: [PATCH 08/12] use a random port and do not bind port in the Dockerfile or devcontainer.json --- .devcontainer/Dockerfile | 3 --- .devcontainer/devcontainer.json | 2 +- .devcontainer/docker-compose.yml | 13 +++++++------ server.py | 3 ++- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index e485bfe..a3e9b2c 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,8 +1,5 @@ FROM ghcr.io/shoppal-ai/llm-base -ARG HTTP_PROXY -ARG HTTPS_PROXY - RUN apt update && apt install -y zsh curl git sudo wget libsndfile1 ffmpeg # # install python environment diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 64ea849..e52b57f 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -39,7 +39,7 @@ // Use 'forwardPorts' to make a list of ports inside the container available locally. // This can be used to network with other containers or the host. - "forwardPorts": [9876], + // "forwardPorts": [8080], // Run Args to use GPU // "runArgs": ["--gpus", "all"], diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 0b406d2..add0c59 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -5,18 +5,19 @@ services: build: context: .. args: - HTTP_PROXY: http://10.232.14.15:8118 - HTTPS_PROXY: http://10.232.14.15:8118 + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} dockerfile: .devcontainer/Dockerfile volumes: - ~/VoiceStreamAI:/workspace:cached - /data0:/data0:cached - ports: - - "9876:9876" user: vscode command: sleep infinity - deploy: + deploy: resources: reservations: devices: - - capabilities: [gpu] \ No newline at end of file + - capabilities: [gpu] + environment: + - http_proxy=${http_proxy} + - https_proxy=${https_proxy} \ No newline at end of file diff --git a/server.py b/server.py index 7d720ce..8194df7 100644 --- a/server.py +++ b/server.py @@ -15,6 +15,7 @@ import torch import logging import time +import random from transformers import pipeline from pyannote.core import Segment from pyannote.audio import Model @@ -24,7 +25,7 @@ HOST = '0.0.0.0' -PORT = 9876 +PORT = os.environ.get("SERVER_PORT", random.randint(10000, 11000)) SAMPLING_RATE = 16000 AUDIO_CHANNELS = 1 SAMPLES_WIDTH = 2 # int16 From 5a0a40bb17716aee373246fa5d1c18e209d7a3f2 Mon Sep 17 00:00:00 2001 From: wangji Date: Sun, 7 Jan 2024 15:15:25 +0000 Subject: [PATCH 09/12] stream by vad stop --- .devcontainer/docker-compose.yml | 4 +- requirements-dev.txt | 3 + server.py | 212 ++++++++++++++----------------- utils/llm.py | 18 +++ 4 files changed, 115 insertions(+), 122 deletions(-) create mode 100644 utils/llm.py diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 0b406d2..cd108b9 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -11,8 +11,8 @@ services: volumes: - ~/VoiceStreamAI:/workspace:cached - /data0:/data0:cached - ports: - - "9876:9876" + # ports: + # - "9876:9876" user: vscode command: sleep infinity deploy: diff --git a/requirements-dev.txt b/requirements-dev.txt index 4788a7f..cad3105 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,3 +4,6 @@ pyannote.audio torchvision transformers websockets +jupyter +datasets +openai \ No newline at end of file diff --git a/server.py b/server.py index 7d720ce..896f8c1 100644 --- a/server.py +++ b/server.py @@ -1,10 +1,3 @@ -""" -VoiceStreamAI Server: Real-time audio transcription using self-hosted Whisper and WebSocket - -Contributors: -- Alessandro Saccoia - alessandro.saccoia@gmail.com -""" - import asyncio import websockets import uuid @@ -14,159 +7,128 @@ import time import torch import logging +import sys import time from transformers import pipeline from pyannote.core import Segment from pyannote.audio import Model from pyannote.audio.pipelines import VoiceActivityDetection from utils.log import configure_logging +import numpy as np +import io +from utils.llm import chat +import soundfile as sf + logger = configure_logging() -HOST = '0.0.0.0' +HOST = "0.0.0.0" PORT = 9876 SAMPLING_RATE = 16000 AUDIO_CHANNELS = 1 -SAMPLES_WIDTH = 2 # int16 -DEBUG = True -VAD_AUTH_TOKEN = os.environ.get("HF_TOKEN") # get your key here -> https://huggingface.co/pyannote/segmentation +SAMPLES_WIDTH = 2 # int16 +VAD_AUTH_TOKEN = os.environ.get( + "HF_TOKEN" +) # get your key here -> https://huggingface.co/pyannote/segmentation DEFAULT_CLIENT_CONFIG = { - "language" : None, # multilingual - "chunk_length_seconds" : 5, - "chunk_offset_seconds" : 1 + "language": None, # multilingual + "chunk_length_seconds": 2, + "chunk_offset_seconds": 0.5, } - -audio_dir = "audio_files" -os.makedirs(audio_dir, exist_ok=True) device = torch.device("cuda", 1) ## ---------- INSTANTIATES VAD -------- model = Model.from_pretrained("pyannote/segmentation", use_auth_token=VAD_AUTH_TOKEN) vad_pipeline = VoiceActivityDetection(segmentation=model, device=device) -vad_pipeline.instantiate({"onset": 0.5, "offset": 0.5, "min_duration_on": 0.3, "min_duration_off": 0.3}) +vad_pipeline.instantiate( + {"onset": 0.5, "offset": 0.5, "min_duration_on": 0.3, "min_duration_off": 0.3} +) ## ---------- INSTANTIATES SPEECH -------- -#recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3") -recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-medium.en", device=device) +# recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3") +recognition_pipeline = pipeline( + "automatic-speech-recognition", model="openai/whisper-medium.en", device=device +) connected_clients = {} client_buffers = {} client_temp_buffers = {} client_configs = {} -# Counter for each client to keep track of file numbers -file_counters = {} recv_time = {} +file_count = 0 +async def transcribe_and_send(client_id, websocket): + global file_count + if client_id in client_temp_buffers: + client_temp_buffers[client_id] = client_temp_buffers[client_id] + client_buffers[client_id] + else: + client_temp_buffers[client_id] = client_buffers[client_id] + cur_data = client_temp_buffers[client_id] + duration = float(len(cur_data)) / (SAMPLES_WIDTH * SAMPLING_RATE) -async def transcribe_and_send(client_id, websocket, new_audio_data): - global file_counters - - logger.info(f"Client ID {client_id}: new_audio_data length in seconds at transcribe_and_send: {float(len(new_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") - - # Initialize temporary buffer for new clients - if client_id not in client_temp_buffers: - client_temp_buffers[client_id] = bytearray() - - logger.info(f"Client ID {client_id}: client_temp_buffers[client_id] length in seconds at transcribe_and_send: {float(len(client_temp_buffers[client_id])) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") - - # Add new audio data to the temporary buffer - old_audio_data = bytes(client_temp_buffers[client_id]) - - logger.info(f"Client ID {client_id}: old_audio_data length in seconds at transcribe_and_send: {float(len(old_audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") - - - audio_data = old_audio_data + new_audio_data - - logger.info(f"Client ID {client_id}: audio_data length in seconds at transcribe_and_send: {float(len(audio_data)) / float(SAMPLING_RATE * SAMPLES_WIDTH)}") - - # Initialize file counter for new clients - if client_id not in file_counters: - file_counters[client_id] = 0 - - # File path - file_name = f"{audio_dir}/{client_id}_{file_counters[client_id]}.wav" - - logger.info(f"Client ID {client_id}: Filename : {file_name}") - - file_counters[client_id] += 1 - - # Save the audio data - with wave.open(file_name, 'wb') as wav_file: - wav_file.setnchannels(AUDIO_CHANNELS) - wav_file.setsampwidth(SAMPLES_WIDTH) - wav_file.setframerate(SAMPLING_RATE) - wav_file.writeframes(audio_data) - - # Measure VAD time + # vad inference + numpy_audio = np.frombuffer(cur_data, dtype=np.int16) + tensor_audio = torch.tensor(numpy_audio, dtype=torch.float32).view(1, -1) start_time_vad = time.time() - result = vad_pipeline(file_name) + vad_result = vad_pipeline({"waveform":tensor_audio, "sample_rate":SAMPLING_RATE}) vad_time = time.time() - start_time_vad + logger.info(f"Client ID {client_id}: VAD infer time:{vad_time:.2f}, VAD segments: {len(vad_result)}, current audio length: {duration:.2f}s") - # Logging after VAD - logger.info(f"Client ID {client_id}: VAD result segments count: {len(result)}") - logger.info(f"Client ID {client_id}: VAD inference time: {vad_time:.2f}") - - if len(result) == 0: # this should happen just if there's no old audio data - os.remove(file_name) - client_temp_buffers[client_id].clear() + if len(vad_result) == 0: + logger.info("drop this segment due to no voice activity found") + client_temp_buffers[client_id]= bytearray() return - - - # Get last recognized segment - last_segment = None - for segment in result.itersegments(): - last_segment = segment - - logger.info(f"Client ID {client_id}: VAD last Segment end : {last_segment.end}") - - accumulated_secs = len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE) - # if the voice ends before chunk_offset_seconds process it all - timeout_flag = accumulated_secs > 5 - seg_flag = last_segment.end < accumulated_secs - float(client_configs[client_id]['chunk_offset_seconds']) - if timeout_flag or seg_flag : - start_time_transcription = time.time() - - # if client_configs[client_id]['language'] is not None: - # result = recognition_pipeline(file_name, generate_kwargs={"language": client_configs[client_id]['language']}) + end = 0 + for segment in vad_result.itersegments(): + # if segment.start - end > client_configs[client_id]['chunk_offset_seconds']: + # # ASR pipeline + # cut_point = int(end * (SAMPLES_WIDTH * SAMPLING_RATE)) + # cur_numpy = np.frombuffer(cur_data[:cut_point], dtype=np.int16) + # asr_result = recognition_pipeline(cur_numpy) + # client_buffers[client_id] = client_buffers[client_id][cut_point:] + # if asr_result["text"]: + # question = asr_result['text'] + # answer = chat(question) + # await websocket.send(f"Q: {question} A: {answer}") + # return # else: - result = recognition_pipeline(file_name) - - transcription_time = time.time() - start_time_transcription - - logger.info(f"Transcription Time: {transcription_time:.2f} seconds") - - logger.info(f"Client ID {client_id}: Transcribed : {result['text']}") - - if result['text']: - - time_delta = time.time() - recv_time[client_id] - time_delta_str = f"|{time_delta:.3f}s|" - sep_text = time_delta_str if seg_flag else f"......{time_delta_str}" - await websocket.send(result['text'] + sep_text) - client_temp_buffers[client_id].clear() # Clear temp buffer after processing - else: - client_temp_buffers[client_id].clear() - client_temp_buffers[client_id].extend(audio_data) - logger.info(f"Skipping because {last_segment.end} falls after {(len(audio_data) / (SAMPLES_WIDTH * SAMPLING_RATE)) - float(client_configs[client_id]['chunk_offset_seconds'])}") + end = segment.end + if duration - end > client_configs[client_id]['chunk_offset_seconds']: + cut_point = int(end * SAMPLING_RATE) * SAMPLES_WIDTH + logger.info(f"buffer size: {len(cur_data)}, cut_point: {cut_point}") + cur_numpy = np.frombuffer(cur_data[:cut_point], dtype=np.int16) + asr_result = recognition_pipeline(cur_numpy) + client_temp_buffers[client_id] = cur_data[cut_point:] + if asr_result["text"]: + file_count += 1 + question = asr_result['text'] + file_name = os.path.join('audio_files', f"{question}_{file_count}.wav") + with wave.open(file_name, 'wb') as wav_file: + wav_file.setnchannels(AUDIO_CHANNELS) + wav_file.setsampwidth(SAMPLES_WIDTH) + wav_file.setframerate(SAMPLING_RATE) + wav_file.writeframes(cur_data[:cut_point]) + answer = chat(question) + await websocket.send(f"Q: {question} A: {answer}") + return + - os.remove(file_name) # in the end always delete the created file async def receive_audio(websocket, path): logger.info(f"websocket type: {websocket}") client_id = str(uuid.uuid4()) connected_clients[client_id] = websocket client_buffers[client_id] = bytearray() - recv_time[client_id] = None # recv time list + recv_time[client_id] = None # recv time list client_configs[client_id] = DEFAULT_CLIENT_CONFIG - + logger.info(f"Client {client_id} connected") - try: async for message in websocket: @@ -174,21 +136,29 @@ async def receive_audio(websocket, path): client_buffers[client_id].extend(message) recv_time[client_id] = time.time() elif isinstance(message, str): - config = json.loads(message) - if config.get('type') == 'config': - client_configs[client_id] = config['data'] - logger.info(f"Config for {client_id}: {client_configs[client_id]}") - continue + # config = json.loads(message) + # if config.get("type") == "config": + # client_configs[client_id] = config["data"] + # logger.info(f"Config for {client_id}: {client_configs[client_id]}") + continue else: logger.info(f"Unexpected message type from {client_id}") # Process audio when enough data is received - config_buf_size = float(client_configs[client_id]['chunk_length_seconds']) * SAMPLING_RATE * SAMPLES_WIDTH + config_buf_size = ( + float(client_configs[client_id]["chunk_length_seconds"]) + * SAMPLING_RATE + * SAMPLES_WIDTH + ) if len(client_buffers[client_id]) > config_buf_size: - logger.info(f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}") - await transcribe_and_send(client_id, websocket, client_buffers[client_id]) + logger.info( + f"Client ID {client_id}: receive_audio calling transcribe_and_send with length: {len(client_buffers[client_id])}, max length: {config_buf_size}" + ) + await transcribe_and_send( + client_id, websocket + ) client_buffers[client_id].clear() - recv_time[client_id] = list() + recv_time[client_id] = None except websockets.ConnectionClosed as e: logger.info(f"Connection with {client_id} closed: {e}") @@ -196,10 +166,12 @@ async def receive_audio(websocket, path): del connected_clients[client_id] del client_buffers[client_id] + async def main(): async with websockets.serve(receive_audio, HOST, PORT): logger.info(f"WebSocket server started on ws://{HOST}:{PORT}") await asyncio.Future() + if __name__ == "__main__": asyncio.run(main()) diff --git a/utils/llm.py b/utils/llm.py new file mode 100644 index 0000000..866e387 --- /dev/null +++ b/utils/llm.py @@ -0,0 +1,18 @@ +import openai +import os + + +client = openai.Client(api_key="fake_key", base_url="http://vllm:8000/v1/") + +def chat(text): + return f"what do you mean by {text}" + + # response = client.chat.completions.create( + # model="/data0/model_output/shoppal-test/dreampal", + # messages= [{"role": "system", "content": "You are now a dream interpretation expert. Please analyze the description of the dream that I input."}, + # {"role": "user", "content": text }], + # # response_format={ "type": "json_object" }, + # stream=False + # ) + + # return response \ No newline at end of file From 99ede15e70676d7ed546e5b401b3d3dd21330b2c Mon Sep 17 00:00:00 2001 From: wangji Date: Mon, 8 Jan 2024 03:13:35 +0000 Subject: [PATCH 10/12] build with proxy --- .devcontainer/.env | 4 ++++ .devcontainer/docker-compose.yml | 4 ++-- .env.example | 4 ---- .gitignore | 1 - 4 files changed, 6 insertions(+), 7 deletions(-) create mode 100644 .devcontainer/.env delete mode 100644 .env.example diff --git a/.devcontainer/.env b/.devcontainer/.env new file mode 100644 index 0000000..4ef9382 --- /dev/null +++ b/.devcontainer/.env @@ -0,0 +1,4 @@ +http_proxy=http://10.232.14.15:8118 +https_proxy=http://10.232.14.15:8118 +HF_ENDPOINT=https://hf-mirror.com +HF_TOKEN= \ No newline at end of file diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index add0c59..5fc7fe7 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -3,11 +3,11 @@ version: '3.8' services: app: build: - context: .. + context: . args: http_proxy: ${http_proxy} https_proxy: ${https_proxy} - dockerfile: .devcontainer/Dockerfile + dockerfile: Dockerfile volumes: - ~/VoiceStreamAI:/workspace:cached - /data0:/data0:cached diff --git a/.env.example b/.env.example deleted file mode 100644 index 136fcc3..0000000 --- a/.env.example +++ /dev/null @@ -1,4 +0,0 @@ -HTTP_PROXY= -HTTPS_PROXY= -HF_ENDPOINT=https://hf-mirror.com -HF_TOKEN= \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0af100c..4da0aa2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -.env .DS_STORE SHOPPAL_README.md audio_files/* From b9df2ba36b87200437297e5db55e029b30fba3f1 Mon Sep 17 00:00:00 2001 From: wangji Date: Mon, 8 Jan 2024 03:23:04 +0000 Subject: [PATCH 11/12] test --- test.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 test.py diff --git a/test.py b/test.py new file mode 100644 index 0000000..b32718f --- /dev/null +++ b/test.py @@ -0,0 +1,44 @@ +import openai + +import os + +api_key = "fake_key" +api_base = "http://vllm:8000/v1/" +client = openai.Client(api_key=api_key, base_url=api_base) + +model_name = "/data0/model_output/shoppal-test/dreampal" + +def predict(message, history, system_prompt): + history_openai_format = [] + history_openai_format.append({"role": "system", "content": system_prompt}) + for human, assistant in history: + history_openai_format.append({"role": "user", "content": human }) + history_openai_format.append({"role": "assistant", "content":assistant}) + history_openai_format.append({"role": "user", "content": message}) + + response = client.chat.completions.create( + model=model_name, + messages= history_openai_format, + # response_format={ "type": "json_object" }, + stream=True + ) + + partial_message = "" + for chunk in response: + if chunk.choices[0].delta.content and len(chunk.choices[0].delta.content) != 0: + partial_message = partial_message + chunk.choices[0].delta.content + yield partial_message + +system_prompt = """ +You are now a dream interpretation expert. Please analyze the description of the dream that I input. +""" + +response = client.chat.completions.create( + model=model_name, + messages= [{"role": "system", "content": system_prompt}, + {"role": "user", "content": "hello" }], + #sresponse_format={ "type": "json_object" }, + #stream=False + ) + +#print(response.choices[0].message) \ No newline at end of file From 7c2fddac1ad7cbebe91f50824168680bf7b9c856 Mon Sep 17 00:00:00 2001 From: wangji Date: Mon, 8 Jan 2024 07:27:12 +0000 Subject: [PATCH 12/12] add llm; --- server.py | 19 ++++++++++--------- utils/llm.py | 22 ++++++++++++---------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/server.py b/server.py index 231de44..1737bdf 100644 --- a/server.py +++ b/server.py @@ -34,8 +34,8 @@ DEFAULT_CLIENT_CONFIG = { "language": None, # multilingual - "chunk_length_seconds": 2, - "chunk_offset_seconds": 0.5, + "chunk_length_seconds": 5, + "chunk_offset_seconds": 1, } @@ -51,7 +51,7 @@ ## ---------- INSTANTIATES SPEECH -------- # recognition_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3") recognition_pipeline = pipeline( - "automatic-speech-recognition", model="openai/whisper-medium.en", device=device + "automatic-speech-recognition", model="openai/whisper-large-v2", device=device ) @@ -104,29 +104,30 @@ async def transcribe_and_send(client_id, websocket): cut_point = int(end * SAMPLING_RATE) * SAMPLES_WIDTH logger.info(f"buffer size: {len(cur_data)}, cut_point: {cut_point}") cur_numpy = np.frombuffer(cur_data[:cut_point], dtype=np.int16) - asr_result = recognition_pipeline(cur_numpy) + asr_result = recognition_pipeline({"sampling_rate":16000, "raw":cur_numpy}) client_temp_buffers[client_id] = cur_data[cut_point:] if asr_result["text"]: file_count += 1 - question = asr_result['text'] + question = asr_result['text'] + f"...|{time.time()-recv_time[client_id]:.3f}s|" + await websocket.send(question) file_name = os.path.join('audio_files', f"{question}_{file_count}.wav") with wave.open(file_name, 'wb') as wav_file: wav_file.setnchannels(AUDIO_CHANNELS) wav_file.setsampwidth(SAMPLES_WIDTH) wav_file.setframerate(SAMPLING_RATE) wav_file.writeframes(cur_data[:cut_point]) - answer = chat(question) - await websocket.send(f"Q: {question} A: {answer}") + answer = chat(asr_result['text']) + f"...|{time.time()-recv_time[client_id]:.3f}s|" + await websocket.send(answer) return async def receive_audio(websocket, path): + global recv_time logger.info(f"websocket type: {websocket}") client_id = str(uuid.uuid4()) connected_clients[client_id] = websocket client_buffers[client_id] = bytearray() - recv_time[client_id] = None # recv time list client_configs[client_id] = DEFAULT_CLIENT_CONFIG logger.info(f"Client {client_id} connected") @@ -159,7 +160,7 @@ async def receive_audio(websocket, path): client_id, websocket ) client_buffers[client_id].clear() - recv_time[client_id] = None + except websockets.ConnectionClosed as e: logger.info(f"Connection with {client_id} closed: {e}") diff --git a/utils/llm.py b/utils/llm.py index 866e387..441075f 100644 --- a/utils/llm.py +++ b/utils/llm.py @@ -2,17 +2,19 @@ import os -client = openai.Client(api_key="fake_key", base_url="http://vllm:8000/v1/") +client = openai.Client(api_key="fake_key", base_url="http://10.232.14.16:8000/v1/") def chat(text): - return f"what do you mean by {text}" + response = client.chat.completions.create( + model="/data0/models/huggingface/meta-llama/Llama-2-7b-chat-hf/", + messages= [{"role": "system", "content": "you are a usefull agent and try to answer each question within 15 words"}, + {"role": "user", "content": text }], + # response_format={ "type": "json_object" }, + #stream=False + ) - # response = client.chat.completions.create( - # model="/data0/model_output/shoppal-test/dreampal", - # messages= [{"role": "system", "content": "You are now a dream interpretation expert. Please analyze the description of the dream that I input."}, - # {"role": "user", "content": text }], - # # response_format={ "type": "json_object" }, - # stream=False - # ) + return response.choices[0].message.content - # return response \ No newline at end of file +if __name__ == '__main__': + ret = chat("hello") + print(ret) \ No newline at end of file