# build_index.py
import os
import glob
import sys

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma


def load_markdown_files(docs_dir):
    """Recursively collect all Markdown files under docs_dir and load them."""
    abs_docs_dir = os.path.abspath(docs_dir)
    print(f"👉 Scanning directory: {abs_docs_dir}")
    file_paths = glob.glob(os.path.join(abs_docs_dir, "**", "*.md"), recursive=True)
    if not file_paths:
        print("❌ No Markdown files found; check the docs_dir setting!")
        sys.exit(1)
    docs = []
    for path in file_paths:
        loader = TextLoader(path, encoding="utf-8")
        loaded = loader.load()
        print(f"  - {os.path.basename(path)}: loaded {len(loaded)} raw document(s)")
        docs.extend(loaded)
    print(f"✅ Loaded {len(docs)} raw documents in total")
    return docs


def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Split raw documents into overlapping chunks for embedding."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    new_docs = splitter.split_documents(docs)
    print(f"✅ Splitting done: {len(docs)} documents -> {len(new_docs)} chunks")
    return new_docs


def build_index(docs, persist_dir, model_name):
    """Embed the chunks and write them into a persistent Chroma store."""
    if not docs:
        print("❌ No chunks to write; check the previous steps!")
        sys.exit(1)
    os.makedirs(persist_dir, exist_ok=True)
    emb = HuggingFaceEmbeddings(model_name=model_name)
    db = Chroma(
        persist_directory=persist_dir,
        embedding_function=emb,
    )
    print("👉 Writing to the vector store (persisted automatically)...")
    db.add_documents(docs)
    # Query the underlying collection directly for a count
    count = db._collection.count()
    print(f"✅ Wrote {count} embeddings to '{persist_dir}'")


def main():
    docs_dir = "./markdown_docs"
    persist_dir = "./vector_store"
    model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

    raw_docs = load_markdown_files(docs_dir)
    docs = split_documents(raw_docs, chunk_size=1000, chunk_overlap=200)
    build_index(docs, persist_dir, model_name)


if __name__ == "__main__":
    main()
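
# --- Optional sanity check --------------------------------------------------
# Illustrative sketch, not part of the build flow above: reopen the persisted
# store with the same embedding model and run a similarity search to confirm
# the index answers queries. The query string and k below are placeholder
# values; adjust them to your own corpus.
#
#   emb = HuggingFaceEmbeddings(
#       model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
#   )
#   db = Chroma(persist_directory="./vector_store", embedding_function=emb)
#   for doc in db.similarity_search("your test query", k=3):
#       print(doc.metadata.get("source", "?"), doc.page_content[:80])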