-
Notifications
You must be signed in to change notification settings - Fork 0
Description
#web.py
import os
import sys
import torch
import time
import json
import gradio as gr
from typing import Tuple
from pathlib import Path
from llama import Llama, ModelArgs, Transformer, Tokenizer
import socket
def check_port(host, port):
    """Return True if a TCP connection to (host, port) succeeds.

    Used below to detect whether another torchrun rank has already
    bound the web-UI port. `connect_ex` returns an error code instead
    of raising on a refused connection, so a closed port simply yields
    a nonzero result.

    Fix vs. original: the socket was only closed on the happy path;
    if `connect_ex` raised (e.g. name-resolution failure) it leaked.
    The context manager guarantees closure on every path.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex((host, port)) == 0
# --- Model and sampling configuration -----------------------------------
# Location of the Llama-2 13B chat checkpoint and its tokenizer model.
ckpt_dir = "/llama/llama2-13b-chat"
tokenizer_path = "/llama/llama2-13b-chat/tokenizer.model"
temperature = 0.6  # sampling temperature forwarded to chat_completion
top_p = 0.9  # nucleus-sampling cutoff forwarded to chat_completion
max_seq_len = 512  # maximum context length passed to Llama.build
max_batch_size = 4  # maximum batch size passed to Llama.build
def load(
    ckpt_dir: str,
    tokenizer_path: str,
    max_seq_len: int,
    max_batch_size: int,
):
    """Build and return a Llama chat generator from a checkpoint.

    Thin wrapper around ``Llama.build``; all arguments are forwarded
    unchanged. Checkpoint loading, tokenizer setup and model-parallel
    initialisation happen inside ``Llama.build``.
    """
    return Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )
# Build the generator once at import time. NOTE(review): under
# `torchrun --nproc_per_node 2` this module runs once per rank, so the
# model is loaded in every process — per the issue text each rank holds
# only part of the model, and all ranks must call inference together;
# confirm against llama's model-parallel setup.
generator = load(
    ckpt_dir, tokenizer_path, max_seq_len, max_batch_size
)
def process(prompt: str):
    """Run one single-turn chat completion and return the reply text.

    The user's message is wrapped in a fresh dialog with a fixed system
    prompt; no conversation history is kept between calls.
    """
    dialog = [
        {"role": "system", "content": "AI"},
        {"role": "user", "content": prompt},
    ]
    completions = generator.chat_completion(
        [dialog],  # type: ignore
        max_gen_len=256,
        temperature=temperature,
        top_p=top_p,
    )
    # Exactly one dialog was submitted, so the answer is the first entry.
    return str(completions[0]["generation"]["content"])
# Launch the Gradio UI only when port 5000 is still free. Under
# torchrun with nproc_per_node > 1 this script executes once per rank;
# the guard keeps the second rank from crashing on an already-bound
# port. NOTE(review): the rank that skips this branch never serves
# requests — per the issue text below, that rank holds part of the
# model, which presumably causes the inference hang; confirm.
if not check_port('localhost',5000):
    demo = gr.Interface(
        title="极简Llama2问答对话",
        description="还没有做成连续对话,虽然它可以",
        article="基于Llama2",
        fn = process,
        inputs = gr.Textbox(lines=10, placeholder="请输入。。。", label="用户输入"),
        outputs = "text",
    )
    # share=False: no public Gradio tunnel; bind all interfaces on 5000.
    demo.launch(share=False,server_name="0.0.0.0", server_port=5000)
torchrun --nproc_per_node 2 web.py
torchrun 会运行两份代码副本，每个副本只保存模型的一部分。现在对 web 界面做了处理：要么只启动一个，要么启动两个。但如果只通过其中一个 web 界面调用推理，就会导致只有一个副本在执行推理，于是出现一直卡在推理中的现象——可能是在等待另一个副本上的那部分模型参与计算。这种情况有没有好的解决方案呢？