
HighLight
An LLM has to be turned into a service (for example by exposing an HTTP server) before you can build your own applications on top of it.
Deploy ChatGLM3-6B and enable its HTTP server capability
Download the embedding model bge-large-zh-v1.5
The embedding model is required to start the server; download it from:
https://www.modelscope.cn/models/Xorbits/bge-large-zh-v1.5/files
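Instead of downloading file by file from the web page, the files can also be fetched with the modelscope Python SDK. A minimal sketch, assuming modelscope is installed (pip install modelscope); the cache_dir value is only an illustrative path:

# Hypothetical download helper using the modelscope SDK.
from modelscope import snapshot_download

# Downloads Xorbits/bge-large-zh-v1.5 into cache_dir and returns the local path.
local_dir = snapshot_download('Xorbits/bge-large-zh-v1.5', cache_dir='D:/models')
print(local_dir)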
# set LLM path -- change to your own local path
MODEL_PATH = os.environ.get('MODEL_PATH', 'D:\\Github\\chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
# embedding model -- change to your own local path
EMBEDDING_PATH = os.environ.get('EMBEDDING_PATH', "D:\\github\\bge-large-zh-v1.5")
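Before starting the full server, it can be worth checking that the local embedding model loads and produces vectors of the expected size. A quick sanity-check sketch (the path is the one configured above; the test sentence is arbitrary):

# Sanity check: load the local bge-large-zh-v1.5 copy and embed one sentence.
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("D:\\github\\bge-large-zh-v1.5")
vector = embedding_model.encode("今天天气不错")
print(len(vector))  # bge-large-zh-v1.5 produces 1024-dimensional embeddings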
Refer to the demo provided in the official ChatGLM repository:
openai_api_demo/api_server.py
import os
import time

import tiktoken
import torch
import uvicorn

from fastapi import FastAPI, HTTPException, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import List, Literal, Optional, Union
from loguru import logger
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModel
from utils import process_response, generate_chatglm3, generate_stream_chatglm3
from sentence_transformers import SentenceTransformer
from sse_starlette.sse import EventSourceResponse

# Set up limit request time
EventSourceResponse.DEFAULT_PING_INTERVAL = 1000

# set LLM path
MODEL_PATH = os.environ.get('MODEL_PATH', 'D:\\github\\chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)

# set Embedding Model path
EMBEDDING_PATH = os.environ.get('EMBEDDING_PATH', "D:\\github\\bge-large-zh-v1.5")


@asynccontextmanager
async def lifespan(app: FastAPI):
    yield
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []


class FunctionCallResponse(BaseModel):
    name: Optional[str] = None
    arguments: Optional[str] = None


class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system", "function"]
    content: str = None
    name: Optional[str] = None
    function_call: Optional[FunctionCallResponse] = None


class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None
    function_call: Optional[FunctionCallResponse] = None


## for Embedding
class EmbeddingRequest(BaseModel):
    input: List[str]
    model: str


class CompletionUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class EmbeddingResponse(BaseModel):
    data: list
    model: str
    object: str
    usage: CompletionUsage


# for ChatCompletionRequest
class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.8
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
    tools: Optional[Union[dict, List[dict]]] = None
    repetition_penalty: Optional[float] = 1.1


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length", "function_call"]


class ChatCompletionResponseStreamChoice(BaseModel):
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length", "function_call"]]
    index: int


class ChatCompletionResponse(BaseModel):
    model: str
    id: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    usage: Optional[UsageInfo] = None


@app.get("/health")
async def health() -> Response:
    """Health check."""
    return Response(status_code=200)


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def get_embeddings(request: EmbeddingRequest):
    embeddings = [embedding_model.encode(text) for text in request.input]
    embeddings = [embedding.tolist() for embedding in embeddings]

    def num_tokens_from_string(string: str) -> int:
        """
        Returns the number of tokens in a text string.
        use cl100k_base tokenizer
        """
        encoding = tiktoken.get_encoding('cl100k_base')
        num_tokens = len(encoding.encode(string))
        return num_tokens

    response = {
        "data": [
            {
                "object": "embedding",
                "embedding": embedding,
                "index": index
            }
            for index, embedding in enumerate(embeddings)
        ],
        "model": request.model,
        "object": "list",
        "usage": CompletionUsage(
            prompt_tokens=sum(len(text.split()) for text in request.input),
            completion_tokens=0,
            total_tokens=sum(num_tokens_from_string(text) for text in request.input),
        )
    }
    return response


@app.get("/v1/models", response_model=ModelList)
async def list_models():
    model_card = ModelCard(
        id="chatglm3-6b"
    )
    return ModelList(
        data=[model_card]
    )


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    global model, tokenizer

    if len(request.messages) < 1 or request.messages[-1].role == "assistant":
        raise HTTPException(status_code=400, detail="Invalid request")

    # ... (the rest of create_chat_completion and the non-stream predict() helper
    # are omitted here; see the official openai_api_demo/api_server.py for them)


# Streaming helper; its opening lines mirror the official openai_api_demo/api_server.py.
async def predict_stream(model_id, gen_params):
    output = ""
    is_function_call = False
    has_send_first_chunk = False
    for new_response in generate_stream_chatglm3(model, tokenizer, gen_params):
        decoded_unicode = new_response["text"]
        delta_text = decoded_unicode[len(output):]
        output = decoded_unicode

        # When it is not a function call and the character length is > 7,
        # try to judge whether it is a function call according to the special function prefix
        if not is_function_call and len(output) > 7:

            # Determine whether a function is called
            is_function_call = contains_custom_function(output)
            if is_function_call:
                continue

            # Non-function call, direct stream output
            finish_reason = new_response["finish_reason"]

            # Send an empty string first to avoid truncation by subsequent next() operations.
            if not has_send_first_chunk:
                message = DeltaMessage(
                    content="",
                    role="assistant",
                    function_call=None,
                )
                choice_data = ChatCompletionResponseStreamChoice(
                    index=0,
                    delta=message,
                    finish_reason=finish_reason
                )
                chunk = ChatCompletionResponse(
                    model=model_id,
                    id="",
                    choices=[choice_data],
                    created=int(time.time()),
                    object="chat.completion.chunk"
                )
                yield "{}".format(chunk.model_dump_json(exclude_unset=True))

            send_msg = delta_text if has_send_first_chunk else output
            has_send_first_chunk = True
            message = DeltaMessage(
                content=send_msg,
                role="assistant",
                function_call=None,
            )
            choice_data = ChatCompletionResponseStreamChoice(
                index=0,
                delta=message,
                finish_reason=finish_reason
            )
            chunk = ChatCompletionResponse(
                model=model_id,
                id="",
                choices=[choice_data],
                created=int(time.time()),
                object="chat.completion.chunk"
            )
            yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    if is_function_call:
        yield output
    else:
        yield '[DONE]'


async def parse_output_text(model_id: str, value: str):
    """
    Directly output the text content of value

    :param model_id:
    :param value:
    :return:
    """
    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant", content=value),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
    yield '[DONE]'


def contains_custom_function(value: str) -> bool:
    """
    Determine whether 'function_call' according to a special function prefix.

    For example, the functions defined in "tools_using_demo/tool_register.py" are all "get_xxx" and start with "get_"

    [Note] This is not a rigorous judgment method, only for reference.

    :param value:
    :return:
    """
    return value and 'get_' in value


if __name__ == "__main__":
    # Load LLM
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True, device_map="auto").eval()

    # load Embedding
    embedding_model = SentenceTransformer(EMBEDDING_PATH, device="cuda")
    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
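Once the server is running (python openai_api_demo/api_server.py), the /v1/embeddings route defined above can be exercised with a tiny client. A sketch using the requests package; the input sentence is arbitrary:

# Call the /v1/embeddings endpoint exposed by api_server.py running locally.
import requests

resp = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={"model": "bge-large-zh-v1.5", "input": ["你好,世界"]},
)
data = resp.json()
print(len(data["data"][0]["embedding"]))  # one embedding vector per input string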
Q&A example over the HTTP API
curl -H "Content-Type: application/json" -X POST \
  -d '{ "messages": [ { "role":"user", "content":"给我讲个笑话" } ], "model":"chatglm3-6b" }' \
  http://localhost:8000/v1/chat/completions
HTTP/1.1 200 OK
date: Sat, 16 Mar 2024 13:16:00 GMT
server: uvicorn
content-length: 611
content-type: application/json
Connection: close

{
  "model": "chatglm3-6b",
  "id": "",
  "object": "chat.completion",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "好的,给您讲一个轻松的笑话:\n\n有一天,小明在公园里捡到一个神奇的灯笼。他捧着灯笼说了:“我希望我成为世界上最聪明的人!”突然,他变成了一个女人。\n\n这个笑话是在玩弄性别刻板印象,暗示女性比男性更聪明。希望这个笑话能带给您快乐!",
        "name": null,
        "function_call": null
      },
      "finish_reason": "stop"
    }
  ],
  "created": 1710594964,
  "usage": {
    "prompt_tokens": 11,
    "total_tokens": 83,
    "completion_tokens": 72
  }
}
The LLM told a rather awkward joke~
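Because the routes mimic the OpenAI API, the same chat request can also be issued with the official openai Python SDK. A sketch assuming openai>=1.0 is installed; the api_key value is a placeholder since this demo server does not validate it:

# Chat with the local ChatGLM3-6B server through its OpenAI-compatible API.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.chat.completions.create(
    model="chatglm3-6b",
    messages=[{"role": "user", "content": "给我讲个笑话"}],
)
print(completion.choices[0].message.content)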
If the HTTP server fails while handling a request with a "failed to open nvrtc-builtins64_121.dll" error, see reference [1] below for the fix.