
torchrun model parallelism question #1

@19y

Description


# web.py
import os
import sys
import torch
import time
import json
import socket

import gradio as gr

from typing import Tuple
from pathlib import Path
from llama import Llama, ModelArgs, Transformer, Tokenizer


def check_port(host, port):
    # True if something is already listening on (host, port).
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    result = s.connect_ex((host, port))
    s.close()
    return result == 0


ckpt_dir = "/llama/llama2-13b-chat"
tokenizer_path = "/llama/llama2-13b-chat/tokenizer.model"
temperature = 0.6
top_p = 0.9
max_seq_len = 512
max_batch_size = 4


def load(
    ckpt_dir: str,
    tokenizer_path: str,
    max_seq_len: int,
    max_batch_size: int,
):
    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )
    return generator


generator = load(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)


def process(prompt: str):
    dialogs = [
        [
            {"role": "system", "content": "AI"},
            {"role": "user", "content": prompt},
        ],
    ]
    results = generator.chat_completion(
        dialogs,  # type: ignore
        max_gen_len=256,
        temperature=temperature,
        top_p=top_p,
    )
    return str(results[0]["generation"]["content"])


# Only start the Gradio server if port 5000 is still free, so the two
# torchrun replicas do not both try to bind it.
if not check_port("localhost", 5000):
    demo = gr.Interface(
        title="Minimal Llama2 Q&A chat",
        description="Not a multi-turn conversation yet, although it could be",
        article="Based on Llama2",
        fn=process,
        inputs=gr.Textbox(lines=10, placeholder="Enter a prompt...", label="User input"),
        outputs="text",
    )
    demo.launch(share=False, server_name="0.0.0.0", server_port=5000)

torchrun --nproc_per_node 2 web.py
torchrun runs two copies of the script, and each copy holds one part of the model. I added the port check to the web server so that either one or two Gradio instances start, but if inference is triggered through only one web instance, only that replica runs the model, and it seems to hang during inference, presumably waiting for the model shard held by the other replica to take part. Is there a good solution for this situation?
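A sketch of one common pattern for this, not tested against this exact setup: let only rank 0 own the Gradio server and broadcast each incoming prompt to the other ranks with torch.distributed, so that every replica calls chat_completion at the same time and the model-parallel forward pass can complete. The sketch reuses the generator, temperature, and top_p globals from web.py above, assumes Llama.build has already initialized the process group, and run_inference is a hypothetical helper name.

# Sketch only: replaces the check_port / gr.Interface block at the end of web.py.
import torch.distributed as dist


def run_inference(prompt: str) -> str:
    # Every rank must enter chat_completion for the sharded forward pass to finish.
    dialogs = [[{"role": "user", "content": prompt}]]
    results = generator.chat_completion(
        dialogs, max_gen_len=256, temperature=temperature, top_p=top_p
    )
    return str(results[0]["generation"]["content"])


def process(prompt: str) -> str:
    # Called by Gradio on rank 0: forward the prompt to the other ranks first,
    # then join the same inference call ourselves.
    payload = [prompt]
    dist.broadcast_object_list(payload, src=0)
    return run_inference(prompt)


if dist.get_rank() == 0:
    # Only rank 0 serves the web UI.
    demo = gr.Interface(fn=process, inputs="text", outputs="text")
    demo.launch(share=False, server_name="0.0.0.0", server_port=5000)
else:
    # The other ranks block on the broadcast, then join the same forward pass;
    # their return value is discarded.
    while True:
        payload = [None]
        dist.broadcast_object_list(payload, src=0)
        run_inference(payload[0])

With this layout the port check is no longer needed, since only one process ever binds port 5000, and the remaining ranks spend their time waiting on the broadcast rather than on a forward pass that never arrives.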
