Deploying an LLM for Free with Modal
This article uses deploying the Q4-quantized QwQ-32B as an example.
Modal provides $30 of free credits per month (adding a payment method is now required, which it wasn't before; without one you only get $5 of credits).
Go to Modal: High-performance AI infrastructure and sign up with a GitHub account.
Then install the modal library locally (it is updated frequently and the API may change; this article uses version 0.73.90):
pip install modal
On the Modal website, open your personal settings, generate a token under API tokens, and configure it locally:
modal token set --token-id xxx --token-secret xxx
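To confirm the token works, you can first run a minimal sanity-check app (a hypothetical hello.py, separate from the deployment below):
import modal

app = modal.App("hello-check")

@app.function()
def hello() -> str:
    return "modal is configured"

@app.local_entrypoint()
def main():
    # runs hello() remotely on Modal and prints the result locally
    print(hello.remote())
Running modal run hello.py should print the message if the token was set correctly.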
Create a new file qwq.py:
import modal
from modal import App, method
from typing import Dict

IMAGE_MODEL_DIR = "/model"  # directory inside the image where the GGUF file is stored
def download_model():
from huggingface_hub import snapshot_download,hf_hub_download
hf_hub_download(repo_id="bartowski/Qwen_QwQ-32B-GGUF", filename="Qwen_QwQ-32B-Q4_K_M.gguf", local_dir=IMAGE_MODEL_DIR)
cuda_version = "12.4.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
image = (
modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
.apt_install("git")
.apt_install("gcc","build-essential","cmake","clang")
.pip_install("https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.77-cu124/llama_cpp_python-0.2.77-cp310-cp310-linux_x86_64.whl")
#.run_commands(
# "CMAKE_ARGS=\"-DGGML_CUDA=on\" pip install llama-cpp-python"
#)
.pip_install(
"einops==0.6.1",
"hf-transfer~=0.1",
"huggingface_hub==0.14.1",
"accelerate",
"colorama",
"cpm_kernels",
"sentencepiece",
"streamlit>=1.24.0",
"protobuf",
"sse-starlette",
"fastapi"
)
# Use huggingface's hi-perf hf-transfer library to download this large model.
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
.run_function(download_model)
)
app = App(name="QwQ", image=image)
@app.cls(gpu="L4",timeout=1200, scaledown_window=60)
class qwq:
@modal.enter()
def e(self):
from llama_cpp import Llama
self.llm = Llama(model_path=IMAGE_MODEL_DIR+"/Qwen_QwQ-32B-Q4_K_M.gguf", n_ctx=8192,seed=-1,n_gpu_layers=-1)
#import subprocess
#output = subprocess.check_output(["nvidia-smi"], text=True)
#print(output)
@method()
def generate(self, req: str):
global flag,que
        import time
        import json
        from threading import Thread
        from queue import Queue
print(req)
messages = json.loads(req)
def gen():
global flag,que
st = time.time()
for response in self.llm.create_chat_completion(messages,stop=["</s>"],stream=True,max_tokens=-1):
if "content" in response["choices"][0]["delta"]:
print(response["choices"][0]["delta"]["content"],end="")
#import subprocess
#output = subprocess.check_output(["nvidia-smi"], text=True)
#print(output)
flag = 1
que.put("data:"+str(response).replace('\'','\"').replace("None","\"None\"")+"\n\n")
if time.time()-st>1000:
break
que.put(None)
yield "data:"+str({"id":"chatcmpl-b32f3ee7-358b-4001-bb0a-44447a99c5d3","model":"/model/ggml-model-q4_0.bin","created":1691553316,"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":""},"finish_reason":None}]}).replace('\'','\"').replace("None","\"None\"")+"\n\n"
flag = 0
que = Queue()
thread = Thread(target=gen)
thread.start()
while flag==0:
yield "data:"+str({"id":"chatcmpl-b32f3ee7-358b-4001-bb0a-44447a99c5d3","model":"/model/ggml-model-q4_0.bin","created":1691553316,"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":""},"finish_reason":None}]}).replace('\'','\"').replace("None","\"None\"")+"\n\n"
time.sleep(1)
while True:
try:
i = que.get()
                if i is None:
break
yield i
except:
pass
thread.join()
yield 'data:[DONE]\n\n'
@app.local_entrypoint()
def cli():
question = '[{"role": "user", "content": "你好"}]'
model = qwq()
    for text in model.generate.remote_gen(question):
print(text, end="", flush=True)
@app.function(timeout=1200)
@modal.fastapi_endpoint(method="POST")
def get(question: Dict):
    import json
    from fastapi.responses import StreamingResponse
    model = qwq()
    # generate() expects a JSON string, so serialize the messages list from the request body
    return StreamingResponse(
        model.generate.remote_gen(json.dumps(question["messages"])),
media_type="text/event-stream",
)
A 32B model needs an L4 GPU to run at a usable speed.
You can adjust scaledown_window to control how long the container stays up with no conversation before it is shut down, as shown in the sketch below.
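For example, to keep an idle container alive for five minutes instead of one, the class decorator could be changed like this (300 is just an illustrative value; a longer window means fewer cold starts but more GPU time billed):
@app.cls(gpu="L4", timeout=1200, scaledown_window=300)  # shut down after 5 minutes with no requests
class qwq:
    ...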
Because of the platform's limitations, the backend has to be written by hand, which is somewhat tedious.
Then run:
modal deploy qwq.py
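You can also invoke the local_entrypoint (cli) first to sanity-check the app before pointing anything at the web endpoint:
modal run qwq.py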
Now find the URL of the deployed endpoint on the Modal website; it can then be called following the OpenAI API format.
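As a rough sketch (assuming the requests library is installed; the URL below is a placeholder for the one Modal shows for the get endpoint), a streaming client could look like this:
import json
import requests

# placeholder URL - replace with the endpoint shown in the Modal dashboard
URL = "https://your-workspace--qwq-get.modal.run"

payload = {"messages": [{"role": "user", "content": "你好"}]}
with requests.post(URL, json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        # each event is a line of the form "data:{...}" or "data:[DONE]"
        if not line or not line.startswith("data:"):
            continue
        data = line[len("data:"):].strip()
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        delta = chunk["choices"][0]["delta"]
        print(delta.get("content", ""), end="", flush=True)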
Here is a toy deployment I have already set up: QwQ-32B
Cold-starting a container takes time, so on the first conversation, or after 60 seconds with no conversation, you will have to wait for the container to start up and load the model.