Building an Enterprise Private Knowledge Base with Crawler, Vector Database, and LLM

Overview

Large Language Models face several critical challenges in enterprise settings:

  1. Hardware costs: Running a 13B+ model without quantization requires 24GB+ of VRAM, making experimentation expensive
  2. Training overhead: Knowledge updates require complete retraining cycles
  3. Hallucination: Models generate plausible but incorrect responses when lacking domain-specific knowledge

A proven architecture addresses these issues by combining vector databases with LLMs. The workflow involves storing domain knowledge as embeddings in a vector database, retrieving relevant context during queries, and feeding this context to the LLM for accurate responses.
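
The store-retrieve-generate loop can be sketched end to end with a toy, dependency-free stand-in: a bag-of-words vector replaces the learned embedding model, and brute-force cosine search replaces the vector database. All names here (embed, cosine, retrieve) are illustrative, not part of any SDK.

```python
# Toy sketch of the retrieve-then-generate (RAG) workflow described above.
from collections import Counter
import math

def embed(text: str) -> Counter:
    """Bag-of-words 'embedding' as a stand-in for a learned model."""
    return Counter(text.lower().split())

def cosine(a: Counter, b: Counter) -> float:
    dot = sum(a[t] * b[t] for t in a)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    return dot / (na * nb) if na and nb else 0.0

# "Store domain knowledge as embeddings"
docs = [
    "HNSW indexes trade memory for fast approximate search",
    "Cosine similarity measures the angle between vectors",
]
index = [(d, embed(d)) for d in docs]

def retrieve(query: str, k: int = 1):
    """Brute-force nearest-neighbor search over the toy index."""
    qv = embed(query)
    ranked = sorted(index, key=lambda p: cosine(qv, p[1]), reverse=True)
    return [d for d, _ in ranked[:k]]

# "Feed retrieved context to the LLM" -- here we just build the prompt
context = retrieve("how does cosine similarity work")[0]
prompt = f"Context:\n{context}\n\nQuestion: how does cosine similarity work"
print(prompt)
```

A production system swaps in a real embedding model and an ANN index, but the data flow is identical.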

This guide demonstrates building a complete knowledge base system using:

  • Web crawler for data acquisition
  • Tencent Cloud VectorDB for storage
  • ChatGLM3 for conversational AI

Part 1: Web Scraper Development

Environment Setup

python -m venv venv
source venv/bin/activate  # Linux/Mac
# venv\Scripts\activate  # Windows

Install dependencies:

pip install requests beautifulsoup4 lxml

Crawler Implementation

Create crawler.py with the following structure:

import requests
import json
import re
from bs4 import BeautifulSoup

BASE_URL = "https://cloud.tencent.com"
SEED_URL = f"{BASE_URL}/document/product/1709"

class DocumentCrawler:
    def __init__(self):
        self.url_queue = []
        self.documents = []
    
    def extract_navigation_urls(self, url):
        """Extract sidebar navigation links from the documentation site."""
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        nav_container = soup.select_one("textarea.J-qcSideNavListData")
        
        if nav_container:
            nav_data = json.loads(nav_container.get_text())["list"]
            self._parse_nav_tree(nav_data)
    
    def _parse_nav_tree(self, nodes):
        """Recursively parse navigation tree structure."""
        for node in nodes:
            link = f"{BASE_URL}{node['link']}"
            self.url_queue.append({
                "title": node["title"],
                "link": link
            })
            if children := node.get("children"):
                self._parse_nav_tree(children)
    
    def extract_page_content(self, url, title):
        """Extract main content from individual documentation pages."""
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        content_container = soup.select_one("div.J-markdown-box")
        
        if content_container:
            text = content_container.get_text()[:20000]
            cleaned_text = re.sub(r'\n+', '\n', text)
            return {
                "url": url,
                "title": title,
                "text": cleaned_text
            }
        return None
    
    def run(self):
        """Execute the crawling process."""
        self.extract_navigation_urls(SEED_URL)
        
        for item in self.url_queue:
            print(f"Crawling: {item['title']}")
            doc = self.extract_page_content(item["link"], item["title"])
            if doc:
                self.documents.append(doc)
        
        return self.documents

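As a quick offline check of the crawler's flattening logic, the recursion in _parse_nav_tree can be exercised against a mocked payload. The sample_nav shape below (a list of {title, link, children} nodes) is inferred from the crawler code rather than from official documentation, and the link paths are placeholders:

```python
# Standalone sketch of the recursive sidebar flattening in _parse_nav_tree.
BASE_URL = "https://cloud.tencent.com"

def flatten_nav(nodes):
    """Flatten a nested navigation tree into a crawl queue, depth-first."""
    items = []
    for node in nodes:
        items.append({"title": node["title"], "link": f"{BASE_URL}{node['link']}"})
        items.extend(flatten_nav(node.get("children") or []))
    return items

# Mocked payload mimicking the JSON stored in the J-qcSideNavListData textarea
sample_nav = [
    {
        "title": "Product Overview",
        "link": "/document/product/1709",
        "children": [
            {"title": "Concepts", "link": "/document/product/1709/concepts"},
        ],
    },
]

for item in flatten_nav(sample_nav):
    print(f"{item['title']} -> {item['link']}")
```

Parent nodes are queued before their children, so the crawl order mirrors the sidebar's top-to-bottom layout.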
Part 2: Vector Database Integration

Tencent Cloud VectorDB Setup

  1. Navigate to Tencent Cloud Console and search for "Vector Database"
  2. Select nearest region and create a free-tier instance
  3. Enable external network access in instance settings
  4. Configure IP whitelist (0.0.0.0/0 for testing)
  5. Retrieve connection credentials from the console

Database Connection Module

Create vector_store.py:

import tcvectordb
from tcvectordb.model.collection import Embedding
from tcvectordb.model.document import Document
from tcvectordb.model.enum import (
    FieldType, IndexType, MetricType, 
    EmbeddingModel, ReadConsistency
)
from tcvectordb.model.index import Index, VectorIndex, FilterIndex

class VectorStoreManager:
    def __init__(self, endpoint: str, username: str, api_key: str):
        self.client = tcvectordb.VectorDBClient(
            url=endpoint,
            username=username,
            key=api_key,
            read_consistency=ReadConsistency.EVENTUAL_CONSISTENCY,
            timeout=30
        )
    
    def initialize_schema(self):
        """Create database and collection with proper indexing."""
        db_name = "knowledge_base"
        coll_name = "documentation"
        
        # Raises if the database already exists; drop or reuse it on reruns
        db = self.client.create_database(db_name)
        
        index_schema = Index()
        index_schema.add(VectorIndex(
            field_name='vector',
            dimension=1024,
            index_type=IndexType.HNSW,
            metric_type=MetricType.COSINE,
            params={'m': 16, 'ef_construction': 200}
        ))
        index_schema.add(FilterIndex('id', FieldType.STRING, IndexType.PRIMARY_KEY))
        index_schema.add(FilterIndex('title', FieldType.STRING, IndexType.FILTER))
        
        embedding_config = Embedding(
            vector_field='vector',
            field='text',
            model=EmbeddingModel.TEXT2VEC_LARGE_CHINESE
        )
        
        db.create_collection(
            name=coll_name,
            shard=3,
            replicas=0,
            description='Documentation knowledge base',
            index=index_schema,
            embedding=embedding_config,
            timeout=50
        )
    
    def store_documents(self, documents: list):
        """Upsert documents into the vector store."""
        db = self.client.database('knowledge_base')
        collection = db.collection('documentation')
        
        docs_to_insert = [
            Document(
                id=doc["url"],
                text=doc["text"],
                title=doc["title"]
            )
            for doc in documents
        ]
        
        collection.upsert(documents=docs_to_insert, build_index=True)
        print(f"Stored {len(docs_to_insert)} documents")
    
    def semantic_search(self, query: str, limit: int = 3):
        """Perform semantic search using text query."""
        db = self.client.database('knowledge_base')
        collection = db.collection('documentation')
        
        results = collection.searchByText(
            embeddingItems=[query],
            limit=limit
        )
        
        return results.get('documents', [])

Important Configuration Notes

  • Do not create a separate filter index on the vector field; the vector index configuration already covers it
  • Ensure the embedding model dimension matches the collection's vector dimension
  • Dynamic schema support allows arbitrary fields beyond id and vector

Part 3: LLM Chat Interface

Dependencies

pip install transformers torch streamlit gradio sentencepiece accelerate

Chat Application

Create chat_app.py:

import os
import streamlit as st
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

st.set_page_config(
    page_title="AI Knowledge Assistant",
    layout="wide"
)

@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    
    if DEVICE == 'cuda':
        model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
        model = model.half().to(DEVICE)
    else:
        model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
        model = model.float().to(DEVICE)
    
    model.eval()
    return tokenizer, model

tokenizer, model = load_model()

if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []
if "kv_cache" not in st.session_state:
    st.session_state.kv_cache = None

with st.sidebar:
    st.header("Parameters")
    max_tokens = st.slider("Max Length", 0, 32768, 8192)
    top_p = st.slider("Top P", 0.0, 1.0, 0.8)
    temperature = st.slider("Temperature", 0.0, 1.0, 0.8)
    
    if st.button("Clear History"):
        st.session_state.conversation_history = []
        st.session_state.kv_cache = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

for msg in st.session_state.conversation_history:
    role = "user" if msg["role"] == "user" else "assistant"
    with st.chat_message(role):
        st.markdown(msg["content"])

user_input = st.chat_input("Ask your question...")

if user_input:
    with st.chat_message("user"):
        st.markdown(user_input)
    
    with st.chat_message("assistant"):
        response_placeholder = st.empty()
        
        # Conversation context is carried through the KV cache
        # (past_key_values), so an empty history list is passed each turn;
        # stream_chat returns the fully accumulated history
        current_history = []
        
        for response, history, kv_cache in model.stream_chat(
            tokenizer,
            user_input,
            current_history,
            past_key_values=st.session_state.kv_cache,
            max_length=max_tokens,
            top_p=top_p,
            temperature=temperature,
            return_past_key_values=True
        ):
            response_placeholder.markdown(response)
        
        st.session_state.conversation_history = history
        st.session_state.kv_cache = kv_cache
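
One practical concern the app above does not address: with the KV cache carrying the whole conversation, long sessions can eventually exceed max_length. A simple budget-based trimmer, shown below as a hypothetical helper (not part of ChatGLM3 or Streamlit; a real version would count tokens with the tokenizer rather than characters), keeps only the most recent turns:

```python
def trim_history(history, budget=8000):
    """Keep the most recent turns whose combined length fits the budget.
    Characters are a rough proxy for tokens; a real implementation would
    measure with tokenizer.encode."""
    kept, used = [], 0
    for turn in reversed(history):          # walk newest-to-oldest
        cost = len(turn["content"])
        if used + cost > budget:
            break                           # oldest turns are dropped first
        kept.append(turn)
        used += cost
    return list(reversed(kept))             # restore chronological order

history = [
    {"role": "user", "content": "a" * 6000},
    {"role": "assistant", "content": "b" * 3000},
    {"role": "user", "content": "c" * 1000},
]
print([len(t["content"]) for t in trim_history(history)])  # → [3000, 1000]
```

When history is trimmed this way, the KV cache should be reset as well, since the cached keys/values would no longer match the pruned transcript.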

Part 4: Integrating Vector Search with LLM

Knowledge-Augmented Response Generation

Extend the chat application with vector search capability:

from vector_store import VectorStoreManager

VDB_CONFIG = {
    'endpoint': 'http://your-instance.clb.region.tencentclb.com:50000',
    'username': 'root',
    'api_key': 'your-api-key'
}

vdb_manager = VectorStoreManager(**VDB_CONFIG)

def format_context(search_results):
    """Convert search results into a context string."""
    context_parts = []
    for doc_list in search_results:
        for doc in doc_list:
            context_parts.append(doc["text"])
    return "\n".join(context_parts)[:20000]

def generate_augmented_prompt(user_query: str, context: str) -> str:
    """Create a prompt that includes retrieved knowledge."""
    return f"Based on the following information, answer the question.\n\nContext:\n{context}\n\nQuestion: {user_query}"

with st.sidebar:
    st.header("Conversation Mode")
    mode = st.selectbox(
        "Select Mode",
        ["Knowledge Base Q&A", "Standard Chat"],
        key="mode_selector"
    )

if user_input:
    if mode == "Knowledge Base Q&A":
        search_results = vdb_manager.semantic_search(user_input, limit=3)
        context = format_context(search_results)
        prompt = generate_augmented_prompt(user_input, context)
        
        # Answer each knowledge-base query statelessly: reset both the
        # visible history and the KV cache so stale context cannot leak in
        st.session_state.conversation_history = []
        st.session_state.kv_cache = None
    else:
        prompt = user_input
    
    with st.chat_message("assistant"):
        response_placeholder = st.empty()
        final_response = ""
        
        for response, history, kv_cache in model.stream_chat(
            tokenizer,
            prompt,
            st.session_state.conversation_history if mode != "Knowledge Base Q&A" else [],
            past_key_values=st.session_state.kv_cache if mode != "Knowledge Base Q&A" else None,
            max_length=max_tokens,
            top_p=top_p,
            temperature=temperature,
            return_past_key_values=True
        ):
            final_response = response
            response_placeholder.markdown(response)
        
        if mode == "Knowledge Base Q&A":
            references = "\n\n**References:**\n"
            for doc_list in search_results:
                for doc in doc_list:
                    references += f"- [{doc['title']}]({doc['id']})\n"
            st.markdown(references)
        else:
            # Persist the updated history and KV cache so standard chat
            # remains multi-turn across reruns
            st.session_state.conversation_history = history
            st.session_state.kv_cache = kv_cache
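
Because format_context and generate_augmented_prompt are pure string handling, the retrieval-to-prompt path can be tested offline with mocked search results. The nested-list shape below mirrors what semantic_search returns (one document list per query):

```python
def format_context(search_results):
    """Concatenate retrieved document texts, capped at 20k characters."""
    context_parts = []
    for doc_list in search_results:
        for doc in doc_list:
            context_parts.append(doc["text"])
    return "\n".join(context_parts)[:20000]

def generate_augmented_prompt(user_query, context):
    """Wrap retrieved context and the user's question into one prompt."""
    return (
        "Based on the following information, answer the question.\n\n"
        f"Context:\n{context}\n\nQuestion: {user_query}"
    )

# Mocked search results standing in for a live VectorDB response
mock_results = [[
    {"text": "HNSW supports fast approximate nearest-neighbor search."},
    {"text": "Cosine distance ranks retrieved documents."},
]]
prompt = generate_augmented_prompt(
    "What index type is used?", format_context(mock_results)
)
print(prompt)
```

Checking this path in isolation makes it easy to tune the prompt template without a GPU or a live database connection.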

Architecture Summary

The system operates through three interconnected layers:

  1. Data Layer: Web crawler aggregates documentation content and stores it in Tencent Cloud VectorDB with semantic embeddings
  2. Retrieval Layer: Semantic search extracts relevant knowledge chunks based on user queries
  3. Generation Layer: LLM synthesizes retrieved context with natural language generation

This RAG (Retrieval-Augmented Generation) architecture significantly reduces hallucination by grounding responses in retrieved evidence while maintaining the conversational capabilities of modern LLMs.

Tags: Vector-Database LLM knowledge-base web-crawler RAG

Posted on Wed, 13 May 2026 04:56:58 +0000 by ozzysworld