Overview
Large Language Models face several critical challenges in enterprise settings:
- Research costs: Running a 13B+ model without quantization requires 24GB+ of VRAM, making experimentation expensive
- Training overhead: Knowledge updates require complete retraining cycles
- Hallucination: Models generate plausible but incorrect responses when lacking domain-specific knowledge
A proven architecture addresses these issues by combining vector databases with LLMs. The workflow involves storing domain knowledge as embeddings in a vector database, retrieving relevant context during queries, and feeding this context to the LLM for accurate responses.
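At its core, the retrieval-augmented flow can be summarized in a few lines of pseudocode. In this sketch, embed(), vector_search(), and llm_generate() are hypothetical placeholders standing in for the components built later in this guide:

# Illustrative RAG loop (placeholder functions, not a real library API)
def answer(question):
    query_vector = embed(question)                    # embed the user query
    context_docs = vector_search(query_vector, k=3)   # retrieve the most relevant knowledge chunks
    prompt = f"Context:\n{context_docs}\n\nQuestion: {question}"
    return llm_generate(prompt)                       # generate an answer grounded in the context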
This guide demonstrates building a complete knowledge base system using:
- Web crawler for data acquisition
- Tencent Cloud VectorDB for storage
- ChatGLM3 for conversational AI
Part 1: Web Scraper Development
Environment Setup
python -m venv venv
source venv/bin/activate # Linux/Mac
# venv\Scripts\activate # Windows
Install dependencies:
pip install requests beautifulsoup4 lxml
Crawler Implementation
Create crawler.py with the following structure:
import requests
import json
import re
from bs4 import BeautifulSoup
BASE_URL = "https://cloud.tencent.com"
SEED_URL = f"{BASE_URL}/document/product/1709"
class DocumentCrawler:
def __init__(self):
self.url_queue = []
self.documents = []
def extract_navigation_urls(self, url):
"""Extract sidebar navigation links from the documentation site."""
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
nav_container = soup.select_one("textarea.J-qcSideNavListData")
if nav_container:
nav_data = json.loads(nav_container.get_text())["list"]
self._parse_nav_tree(nav_data)
def _parse_nav_tree(self, nodes):
"""Recursively parse navigation tree structure."""
for node in nodes:
link = f"{BASE_URL}{node['link']}"
self.url_queue.append({
"title": node["title"],
"link": link
})
if children := node.get("children"):
self._parse_nav_tree(children)
def extract_page_content(self, url, title):
"""Extract main content from individual documentation pages."""
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
content_container = soup.select_one("div.J-markdown-box")
if content_container:
text = content_container.get_text()[:20000]
cleaned_text = re.sub(r'\n+', '\n', text)
return {
"url": url,
"title": title,
"text": cleaned_text
}
return None
def run(self):
"""Execute the crawling process."""
self.extract_navigation_urls(SEED_URL)
for item in self.url_queue:
print(f"Crawling: {item['title']}")
doc = self.extract_page_content(item["link"], item["title"])
if doc:
self.documents.append(doc)
return self.documents
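A minimal way to run the crawler and persist its output for the next step might look like this (the documents.json filename is an arbitrary choice):

if __name__ == "__main__":
    crawler = DocumentCrawler()
    docs = crawler.run()
    # Save the crawled pages so the vector-store step can load them later
    with open("documents.json", "w", encoding="utf-8") as f:
        json.dump(docs, f, ensure_ascii=False, indent=2)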
Part 2: Vector Database Integration
Tencent Cloud VectorDB Setup
- Navigate to the Tencent Cloud Console and search for "Vector Database"
- Select the nearest region and create a free-tier instance
- Enable external network access in the instance settings
- Configure the IP whitelist (0.0.0.0/0 is acceptable for testing only)
- Retrieve the connection credentials (endpoint, username, API key) from the console; the quick check below uses them
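Before building the full module, a quick connectivity check confirms the credentials work. This is a sketch: the endpoint, username, and key are placeholders, and list_databases() is assumed to be available on the client:

import tcvectordb
from tcvectordb.model.enum import ReadConsistency

# Placeholders: substitute the values retrieved from the console
client = tcvectordb.VectorDBClient(
    url="http://your-instance.clb.region.tencentclb.com:50000",
    username="root",
    key="your-api-key",
    read_consistency=ReadConsistency.EVENTUAL_CONSISTENCY,
    timeout=30
)
print(client.list_databases())  # assumption: lists the databases visible to this account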
Database Connection Module
Create vector_store.py:
import tcvectordb
from tcvectordb.model.collection import Embedding
from tcvectordb.model.document import Document
from tcvectordb.model.enum import (
FieldType, IndexType, MetricType,
EmbeddingModel, ReadConsistency
)
from tcvectordb.model.index import Index, VectorIndex, FilterIndex
class VectorStoreManager:
def __init__(self, endpoint: str, username: str, api_key: str):
self.client = tcvectordb.VectorDBClient(
url=endpoint,
username=username,
key=api_key,
read_consistency=ReadConsistency.EVENTUAL_CONSISTENCY,
timeout=30
)
def initialize_schema(self):
"""Create database and collection with proper indexing."""
db_name = "knowledge_base"
coll_name = "documentation"
db = self.client.create_database(db_name)
index_schema = Index()
index_schema.add(VectorIndex(
field_name='vector',
dimension=1024,
index_type=IndexType.HNSW,
metric_type=MetricType.COSINE,
params={'m': 16, 'ef_construction': 200}
))
index_schema.add(FilterIndex('id', FieldType.STRING, IndexType.PRIMARY_KEY))
index_schema.add(FilterIndex('title', FieldType.STRING, IndexType.FILTER))
embedding_config = Embedding(
vector_field='vector',
field='text',
model=EmbeddingModel.TEXT2VEC_LARGE_CHINESE
)
db.create_collection(
name=coll_name,
shard=3,
replicas=0,
description='Documentation knowledge base',
index=index_schema,
embedding=embedding_config,
timeout=50
)
def store_documents(self, documents: list):
"""Upsert documents into the vector store."""
db = self.client.database('knowledge_base')
collection = db.collection('documentation')
docs_to_insert = [
Document(
id=doc["url"],
text=doc["text"],
title=doc["title"]
)
for doc in documents
]
collection.upsert(documents=docs_to_insert, build_index=True)
print(f"Stored {len(docs_to_insert)} documents")
def semantic_search(self, query: str, limit: int = 3):
"""Perform semantic search using text query."""
db = self.client.database('knowledge_base')
collection = db.collection('documentation')
results = collection.searchByText(
embedding_items=[query],
limit=limit
)
return results.get('documents', [])
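Wiring Part 1 and Part 2 together could look like the following sketch (the endpoint and key are placeholders, and DocumentCrawler is the class from Part 1):

from crawler import DocumentCrawler
from vector_store import VectorStoreManager

manager = VectorStoreManager(
    endpoint="http://your-instance.clb.region.tencentclb.com:50000",  # placeholder
    username="root",
    api_key="your-api-key"  # placeholder
)
manager.initialize_schema()                  # one-time: create the database, collection, and indexes
documents = DocumentCrawler().run()          # crawl the documentation site
manager.store_documents(documents)           # embeddings are generated server-side on upsert
print(manager.semantic_search("How do I create a collection?"))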
Important Configuration Notes
- Do not add a separate FilterIndex on the vector field; the VectorIndex definition already covers it
- Ensure the embedding model's output dimension matches the collection's vector dimension (text2vec-large-chinese produces 1024-dimensional vectors, matching the index above)
- Dynamic schema support allows arbitrary scalar fields beyond id, text, and vector, as sketched below
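As an illustration of dynamic fields, a hypothetical source attribute can be attached to a document without declaring it in the schema; only the id, vector, and explicitly indexed fields need to be defined up front:

from tcvectordb.model.document import Document

# "source" is a hypothetical extra field; dynamic schema stores it without an index declaration
doc = Document(
    id="https://cloud.tencent.com/document/product/1709/example",  # placeholder URL
    text="Example documentation paragraph...",
    title="Example Page",
    source="crawler"
)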
Part 3: LLM Chat Interface
Dependencies
pip install transformers torch streamlit gradio sentencepiece accelerate
Chat Application
Create chat_app.py:
import os
import streamlit as st
import torch
from transformers import AutoModel, AutoTokenizer
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
st.set_page_config(
page_title="AI Knowledge Assistant",
layout="wide"
)
@st.cache_resource
def load_model():
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
if DEVICE == 'cuda':
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = model.half().to(DEVICE)
else:
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = model.float().to(DEVICE)
model.eval()
return tokenizer, model
tokenizer, model = load_model()
if "conversation_history" not in st.session_state:
st.session_state.conversation_history = []
if "kv_cache" not in st.session_state:
st.session_state.kv_cache = None
with st.sidebar:
st.header("Parameters")
max_tokens = st.slider("Max Length", 0, 32768, 8192)
top_p = st.slider("Top P", 0.0, 1.0, 0.8)
temperature = st.slider("Temperature", 0.0, 1.0, 0.8)
if st.button("Clear History"):
st.session_state.conversation_history = []
st.session_state.kv_cache = None
if torch.cuda.is_available():
torch.cuda.empty_cache()
for msg in st.session_state.conversation_history:
role = "user" if msg["role"] == "user" else "assistant"
with st.chat_message(role):
st.markdown(msg["content"])
user_input = st.chat_input("Ask your question...")
if user_input:
with st.chat_message("user"):
st.markdown(user_input)
with st.chat_message("assistant"):
response_placeholder = st.empty()
        for response, history, kv_cache in model.stream_chat(
            tokenizer,
            user_input,
            st.session_state.conversation_history,
past_key_values=st.session_state.kv_cache,
max_length=max_tokens,
top_p=top_p,
temperature=temperature,
return_past_key_values=True
):
response_placeholder.markdown(response)
st.session_state.conversation_history = history
st.session_state.kv_cache = kv_cache
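Launch the interface with Streamlit; the DEVICE check above selects GPU or CPU automatically:

streamlit run chat_app.py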
Part 4: Integrating Vector Search with LLM
Knowledge-Augmented Response Generation
Extend the chat application with vector search capability:
from vector_store import VectorStoreManager
VDB_CONFIG = {
'endpoint': 'http://your-instance.clb.region.tencentclb.com:50000',
'username': 'root',
'api_key': 'your-api-key'
}
vdb_manager = VectorStoreManager(**VDB_CONFIG)
def format_context(search_results):
"""Convert search results into a context string."""
context_parts = []
for doc_list in search_results:
for doc in doc_list:
context_parts.append(doc["text"])
return "\n".join(context_parts)[:20000]
def generate_augmented_prompt(user_query: str, context: str) -> str:
"""Create a prompt that includes retrieved knowledge."""
return f"Based on the following information, answer the question.\n\nContext:\n{context}\n\nQuestion: {user_query}"
with st.sidebar:
st.header("Conversation Mode")
mode = st.selectbox(
"Select Mode",
["Knowledge Base Q&A", "Standard Chat"],
key="mode_selector"
)
if user_input:
if mode == "Knowledge Base Q&A":
search_results = vdb_manager.semantic_search(user_input, limit=3)
context = format_context(search_results)
prompt = generate_augmented_prompt(user_input, context)
st.session_state.conversation_history = []
else:
prompt = user_input
with st.chat_message("assistant"):
response_placeholder = st.empty()
final_response = ""
        for response, history, kv_cache in model.stream_chat(
tokenizer,
prompt,
st.session_state.conversation_history if mode != "Knowledge Base Q&A" else [],
past_key_values=st.session_state.kv_cache if mode != "Knowledge Base Q&A" else None,
max_length=max_tokens,
top_p=top_p,
temperature=temperature,
return_past_key_values=True
):
final_response = response
response_placeholder.markdown(response)
if mode == "Knowledge Base Q&A":
references = "\n\n**References:**\n"
for doc_list in search_results:
for doc in doc_list:
references += f"- [{doc['title']}]({doc['id']})\n"
st.markdown(references)
    else:
        st.session_state.conversation_history = history
        st.session_state.kv_cache = kv_cache
Architecture Summary
The system operates through three interconnected layers:
- Data Layer: Web crawler aggregates documentation content and stores it in Tencent Cloud VectorDB with semantic embeddings
- Retrieval Layer: Semantic search extracts relevant knowledge chunks based on user queries
- Generation Layer: LLM synthesizes retrieved context with natural language generation
This RAG (Retrieval-Augmented Generation) architecture significantly reduces hallucination by grounding responses in retrieved evidence while maintaining the conversational capabilities of modern LLMs.