1. Local Model Setup
Required Repositories & Resources:
- ChatGLM3-6B code: https://github.com/THUDM/ChatGLM3
- FastGPT: https://github.com/labring/FastGPT
- Vector model M3E (deploy via Docker): https://huggingface.co/moka-ai/m3e-base
- Anaconda3 installer: https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/
Create Conda Environment
conda create -n glm3-local-demo python=3.10
conda activate glm3-local-demo
ChatGLM3-6B requires Python 3.10 or later.
Test Composite Demo
Navigate to the composite demo directory:
cd /ChatGLM3-main/composite_demo
Install dependencies (remove PyTorch from requirements.txt first and install separately):
pip install -r requirements.txt
Install Jupyter kernel for Code Interpreter:
ipython kernel install --name glm3-local-demo --user
Install PyTorch and TorchVision (match your CUDA version, e.g., CUDA 11.7):
pip install torch-2.0.0+cu117-cp310-cp310-win_amd64.whl
pip install torchvision-0.15.0+cu117-cp310-cp310-win_amd64.whl
Download the wheels from https://download.pytorch.org/whl/ if needed.
Verify GPU Availability
import torch
from transformers import __version__ as transformers_version
import torchvision
print(f"PyTorch Version: {torch.__version__}")
print(f"Transformers Version: {transformers_version}")
print(f"TorchVision Version: {torchvision.__version__}")
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print("GPU is available.")
else:
    print("GPU is NOT available.")
Run Composite Demo
Modify client.py to point to your local model directory, then launch:
streamlit run main.py
For GPUs with less than 12 GB of VRAM, use 8-bit or 4-bit quantization so the model fits in memory and still responds at a reasonable speed.
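For reference, a minimal loading sketch with 4-bit quantization, assuming the standard transformers AutoModel interface and the quantize() helper shipped with the ChatGLM3 remote code; the model path is a placeholder:

from transformers import AutoModel, AutoTokenizer

MODEL_PATH = "D:/models/chatglm3-6b"  # placeholder; point this at your local ChatGLM3-6B directory

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
# quantize(4) / quantize(8) are provided by the ChatGLM3 remote code loaded via trust_remote_code
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).quantize(4).cuda()
model = model.eval()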
Launch OpenAI-Compliant API
Navigate to the OpenAI API demo directory:
cd D:/.../ChatGLM3-main/openai_api_demo
Modify openai_api.py with your model path. Once the server is running (see the command below), test it with Postman:
Request Body:
{
  "model": "chatglm3-6b",
  "messages": [
    {
      "role": "user",
      "content": "Hello",
      "name": "test",
      "function_call": {
        "name": "sample_function",
        "arguments": "{}"
      }
    }
  ],
  "temperature": 0.8,
  "top_p": 0.8,
  "max_tokens": 1024,
  "stream": false,
  "functions": {},
  "repetition_penalty": 1.1
}
Start the API server:
python openai_api.py
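As an alternative to Postman, here is a minimal Python check against the running server, assuming the requests package is installed; it mirrors the request body above:

import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "chatglm3-6b",
        "messages": [{"role": "user", "content": "Hello"}],
        "temperature": 0.8,
        "top_p": 0.8,
        "max_tokens": 1024,
        "stream": False,
    },
    timeout=120,
)
# Print the assistant's reply from the first choice
print(resp.json()["choices"][0]["message"]["content"])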
Modified OpenAI API Code
# coding=utf-8
# Implements OpenAI-compatible API for ChatGLM3-6B
# Usage: python openai_api.py
# Documentation: http://localhost:8000/docs
import os
import time
import json
from contextlib import asynccontextmanager
from typing import List, Literal, Optional, Union
import torch
from torch.cuda import get_device_properties
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, AutoModel
from utils import process_response, generate_chatglm3, generate_stream_chatglm3
LOCAL_MODEL_DIR = os.environ.get('MODEL_PATH', 'THUDM_chatglm3-6b')
TOKENIZER_DIR = os.environ.get("TOKENIZER_PATH", LOCAL_MODEL_DIR)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
@asynccontextmanager
async def lifespan(app: FastAPI):
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class ModelInfo(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "local"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = None
class ModelListResponse(BaseModel):
object: str = "list"
data: List[ModelInfo] = []
class FunctionResponse(BaseModel):
name: Optional[str] = None
arguments: Optional[str] = None
class Message(BaseModel):
role: Literal["user", "assistant", "system", "function"]
content: str = None
name: Optional[str] = None
function_call: Optional[FunctionResponse] = None
class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None
function_call: Optional[FunctionResponse] = None
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Message]
temperature: Optional[float] = 0.8
top_p: Optional[float] = 0.8
max_tokens: Optional[int] = None
stream: Optional[bool] = False
functions: Optional[Union[dict, List[dict]]] = None
repetition_penalty: Optional[float] = 1.1
class ChatChoice(BaseModel):
index: int
message: Message
finish_reason: Literal["stop", "length", "function_call"]
class StreamChatChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length", "function_call"]]
class UsageStats(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionResponse(BaseModel):
model: str
object: Literal["chat.completion", "chat.completion.chunk"]
choices: List[Union[ChatChoice, StreamChatChoice]]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
usage: Optional[UsageStats] = None
@app.get("/v1/models", response_model=ModelListResponse)
async def list_available_models():
model_info = ModelInfo(id="chatglm3-6b")
return ModelListResponse(data=[model_info])
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
if len(request.messages) < 1 or request.messages[-1].role == "assistant":
raise HTTPException(status_code=400, detail="Invalid request structure")
gen_config = dict(
messages=request.messages,
temperature=request.temperature,
top_p=request.top_p,
max_tokens=request.max_tokens or 1024,
echo=False,
stream=request.stream,
repetition_penalty=request.repetition_penalty,
functions=request.functions,
)
logger.debug(f"Request Parameters: {gen_config}")
if request.stream:
stream_generator = stream_predict(request.model, gen_config)
initial_output = next(stream_generator)
if not contains_function_call(initial_output):
return EventSourceResponse(stream_generator, media_type="text/event-stream")
logger.debug(f"First Token Output: {initial_output}")
function_call = None
if initial_output and request.functions:
try:
function_call = process_response(initial_output, use_tool=True)
except:
logger.warning("Failed to parse function call")
if isinstance(function_call, dict):
function_call = FunctionResponse(**function_call)
# In this demo, we didn't register any tools. Implement your own tool dispatch here.
tool_response = ""
if not gen_config.get("messages"):
gen_config["messages"] = []
gen_config["messages"].append(Message(
role="assistant",
content=initial_output,
))
gen_config["messages"].append(Message(
role="function",
name=function_call.name,
content=tool_response,
))
final_generator = predict_final(request.model, gen_config)
return EventSourceResponse(final_generator, media_type="text/event-stream")
else:
fallback_generator = parse_text_output(request.model, initial_output)
return EventSourceResponse(fallback_generator, media_type="text/event-stream")
response = generate_chatglm3(model, tokenizer, gen_config)
if response["text"].startswith("\n"):
response["text"] = response["text"][1:].strip()
usage = UsageStats()
function_call, finish_reason = None, "stop"
if request.functions:
try:
function_call = process_response(response["text"], use_tool=True)
except:
logger.warning("Failed to parse tool call")
if isinstance(function_call, dict):
finish_reason = "function_call"
function_call = FunctionResponse(**function_call)
assistant_msg = Message(
role="assistant",
content=response["text"],
function_call=function_call if isinstance(function_call, FunctionResponse) else None,
)
logger.debug(f"Assistant Response: {assistant_msg}")
chat_choice = ChatChoice(index=0, message=assistant_msg, finish_reason=finish_reason)
task_usage = UsageStats(**response["usage"])
usage.prompt_tokens += task_usage.prompt_tokens
usage.total_tokens += task_usage.total_tokens
usage.completion_tokens += task_usage.completion_tokens
return ChatCompletionResponse(
model=request.model,
choices=[chat_choice],
object="chat.completion",
usage=usage
)
async def predict_final(model_id: str, config: dict):
choice_data = StreamChatChoice(
index=0,
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.model_dump_json(exclude_unset=True)
previous_text = ""
for new_response in generate_stream_chatglm3(model, tokenizer, config):
decoded_text = new_response["text"]
delta_text = decoded_text[len(previous_text):]
previous_text = decoded_text
finish_reason = new_response["finish_reason"]
if len(delta_text) == 0 and finish_reason != "function_call":
continue
function_call = None
if finish_reason == "function_call":
try:
function_call = process_response(decoded_text, use_tool=True)
except:
logger.warning("Failed to parse tool call")
if isinstance(function_call, dict):
function_call = FunctionResponse(**function_call)
delta = DeltaMessage(
content=delta_text,
role="assistant",
function_call=function_call if isinstance(function_call, FunctionResponse) else None,
)
choice_data = StreamChatChoice(index=0, delta=delta, finish_reason=finish_reason)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.model_dump_json(exclude_unset=True)
choice_data = StreamChatChoice(index=0, delta=DeltaMessage(), finish_reason="stop")
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.model_dump_json(exclude_unset=True)
yield '[DONE]'
# Must be a plain (synchronous) generator: create_chat_completion peeks at its first chunk with next().
def stream_predict(model_id, gen_config):
output = ""
is_function = False
sent_first = False
for new_response in generate_stream_chatglm3(model, tokenizer, gen_config):
decoded_text = new_response["text"]
delta_text = decoded_text[len(output):]
output = decoded_text
if not is_function and len(output) > 7:
is_function = contains_function_call(output)
if is_function:
continue
if not sent_first:
yield ChatCompletionResponse(
model=model_id,
choices=[StreamChatChoice(index=0, delta=DeltaMessage(content="", role="assistant"), finish_reason=None)],
object="chat.completion.chunk"
).model_dump_json(exclude_unset=True)
sent_first = True
send_text = delta_text if sent_first else output
yield ChatCompletionResponse(
model=model_id,
choices=[StreamChatChoice(index=0, delta=DeltaMessage(content=send_text, role="assistant"), finish_reason=new_response["finish_reason"])],
object="chat.completion.chunk"
).model_dump_json(exclude_unset=True)
if is_function:
yield output
else:
yield '[DONE]'
async def parse_text_output(model_id: str, value: str):
choice_data = StreamChatChoice(
index=0,
delta=DeltaMessage(role="assistant", content=value),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.model_dump_json(exclude_unset=True)
choice_data = StreamChatChoice(index=0, delta=DeltaMessage(), finish_reason="stop")
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.model_dump_json(exclude_unset=True)
yield '[DONE]'
def contains_function_call(value: str) -> bool:
    # Crude heuristic used by this demo: the sample tool names all start with "get_".
    return bool(value and 'get_' in value)
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
model = AutoModel.from_pretrained(LOCAL_MODEL_DIR, trust_remote_code=True)
if torch.cuda.is_available():
vram_gb = get_device_properties(0).total_memory / 1073741824
print(f'GPU VRAM: {vram_gb:.2f} GB')
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if vram_gb > 13:
model = model.half().cuda()
print('Using FP16 precision (GPU)')
elif vram_gb > 10:
model = model.half().quantize(8).cuda()
print('Using INT8 quantization (GPU)')
elif vram_gb > 4.5:
model = model.half().quantize(4).cuda()
print('Using INT4 quantization (GPU)')
else:
model = model.float()
print('Using CPU')
else:
model = model.float()
print('Using CPU')
model = model.eval()
uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
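The demo above leaves tool_response empty because no tools are registered. Below is a hedged sketch of one way to dispatch function calls; the registry, get_weather, and dispatch_tool names are illustrative and not part of the ChatGLM3 demo:

import json

def get_weather(city: str) -> str:
    # Placeholder tool; replace with a real lookup or API call.
    return json.dumps({"city": city, "temperature_c": 21})

TOOL_REGISTRY = {"get_weather": get_weather}

def dispatch_tool(name: str, arguments: str) -> str:
    # Call the named tool with the JSON arguments produced by the model.
    if name not in TOOL_REGISTRY:
        return json.dumps({"error": f"unknown tool: {name}"})
    try:
        kwargs = json.loads(arguments or "{}")
    except json.JSONDecodeError:
        return json.dumps({"error": "arguments were not valid JSON"})
    return TOOL_REGISTRY[name](**kwargs)

# In create_chat_completion, tool_response could then be filled with:
#   tool_response = dispatch_tool(function_call.name, function_call.arguments)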
2. Deploy One-API
Prerequisites
- Ubuntu 20.04 (e.g., in a VirtualBox VM with VT-x/AMD-V enabled) with Docker installed
- Add your user to the docker group so Docker commands work without sudo (log out and back in for the change to take effect):
sudo usermod -aG docker $USER
Pull and Run the One-API Container
docker run --name one-api-server -d --restart always -p 13000:3000 -e TZ=Asia/Shanghai -v /home/ubuntu/data/one-api:/data justsong/one-api
Configure API Channels
- Access One-API at http://localhost:13000 (login: root/123456)
- Add a ChatGLM3-6B channel:
  - Base URL: http://localhost:8000/v1
- Add an M3E vector model channel:
  - Base URL: http://localhost:6200/v1
- Generate a new API key and save it for the FastGPT configuration (a quick verification sketch follows below).
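A minimal way to confirm the ChatGLM3-6B channel is wired up, assuming the requests package and the key you just generated (placeholder below):

import requests

ONE_API_KEY = "sk-..."  # placeholder; use the key generated in One-API

# Route a chat request through One-API; it should proxy to the ChatGLM3-6B channel.
resp = requests.post(
    "http://localhost:13000/v1/chat/completions",
    headers={"Authorization": f"Bearer {ONE_API_KEY}"},
    json={"model": "chatglm3-6b", "messages": [{"role": "user", "content": "Hello"}]},
    timeout=120,
)
print(resp.status_code, resp.json()["choices"][0]["message"]["content"])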
Deploy M3E Vector Model
CPU-only deployment:
docker run -d -p 6200:6008 --name=m3e-embed-api registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/m3e-large-api:latest
GPU-accelerated deployment:
docker run -d -p 6200:6008 --gpus all --name=m3e-embed-api registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/m3e-large-api:latest
Test M3E Embedding
curl --location --request POST 'http://localhost:6200/v1/embeddings' \
--header 'Authorization: Bearer sk-your-api-key-here' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "m3e",
"input": ["What is LAF?"]
}'
A successful response returns an embedding vector array.
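To confirm the M3E channel also works through One-API (rather than hitting port 6200 directly), a similar sketch with the same placeholder key:

import requests

ONE_API_KEY = "sk-..."  # placeholder; the One-API key from the previous step

resp = requests.post(
    "http://localhost:13000/v1/embeddings",
    headers={"Authorization": f"Bearer {ONE_API_KEY}"},
    json={"model": "m3e", "input": ["What is LAF?"]},
    timeout=60,
)
vector = resp.json()["data"][0]["embedding"]
print(f"Embedding dimension: {len(vector)}")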
3. Deploy FastGPT
Download Configuration Files
curl -O https://raw.githubusercontent.com/labring/FastGPT/main/files/deploy/fastgpt/docker-compose.yml
curl -O https://raw.githubusercontent.com/labring/FastGPT/main/files/deploy/fastgpt/config.json
Modify Configuration
docker-compose.yml changes:
- Set OPENAI_BASE_URL to http://localhost:13000/v1
- Set CHAT_API_KEY to your One-API key
config.json changes (add ChatGLM3-6B and M3E models):
{
"systemEnv": {
"openapiPrefix": "fastgpt",
"vectorMaxProcess": 15,
"qaMaxProcess": 15,
"pgHNSWEfSearch": 100
},
"llmModels": [
{
"model": "chatglm3-6b",
"name": "ChatGLM3-6B",
"maxContext": 4000,
"maxResponse": 4000,
"quoteMaxToken": 2000,
"maxTemperature": 1,
"vision": false,
"defaultSystemChatPrompt": ""
},
{
"model": "gpt-3.5-turbo-16k",
"name": "GPT-3.5 Turbo 16K",
"maxContext": 16000,
"maxResponse": 16000,
"quoteMaxToken": 13000,
"maxTemperature": 1.2,
"inputPrice": 0,
"outputPrice": 0,
"censor": false,
"vision": false,
"datasetProcess": true,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": ""
}
],
"vectorModels": [
{
"model": "m3e",
"name": "M3E Embedding",
"price": 0.1,
"defaultToken": 500,
"maxToken": 1800
},
{
"model": "text-embedding-ada-002",
"name": "OpenAI Embedding",
"inputPrice": 0,
"outputPrice": 0,
"defaultToken": 700,
"maxToken": 3000,
"weight": 100
}
],
"reRankModels": [],
"audioSpeechModels": [],
"whisperModel": {}
}
Launch FastGPT
docker-compose pull
docker-compose up -d
Initialize MongoDB Replica Set (FastGPT ≥ 4.6.8)
# Check if MongoDB container is running
docker ps
# Enter container
docker exec -it mongo bash
# Connect to database
mongo -u your_username -p your_password --authenticationDatabase admin
# Initialize replica set
rs.initiate({
_id: "rs0",
members: [
{ _id: 0, host: "mongo:27017" }
]
})
# Verify status
rs.status()
Access FastGPT
Open http://localhost:3000 in your browser. Ensure the ChatGLM3-6B API server is running and One-API channels are correctly configured.