# NOTE(review): removed a Hugging Face Spaces status banner ("Spaces / Sleeping")
# that was pasted in above the script and is not part of the program.
# build_index.py
import glob
import os
import sys

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
def load_markdown_files(docs_dir, pattern="*.md"):
    """Recursively load text documents from ``docs_dir``.

    Args:
        docs_dir: Directory to scan (relative or absolute).
        pattern: Glob filename pattern. Defaults to ``"*.md"`` so existing
            callers keep loading Markdown files; pass e.g. ``"*.txt"`` to
            index other plain-text sources.

    Returns:
        list: Every document object produced by ``TextLoader`` for each
        matching file, in sorted path order (sorted so the index build is
        deterministic — raw ``glob`` order is OS-dependent).

    Exits the process with status 1 when nothing matches, preserving the
    original fail-fast behavior.
    """
    abs_docs_dir = os.path.abspath(docs_dir)
    print(f"👉 正在扫描目录: {abs_docs_dir}")
    file_paths = sorted(
        glob.glob(os.path.join(abs_docs_dir, "**", pattern), recursive=True)
    )
    if not file_paths:
        print("❌ 没有找到任何 Markdown 文件,请检查 docs_dir 配置!")
        sys.exit(1)
    docs = []
    for path in file_paths:
        # TextLoader yields one (or more) Document objects per file.
        loader = TextLoader(path, encoding="utf-8")
        loaded = loader.load()
        print(f" - {os.path.basename(path)}: 加载 {len(loaded)} 段原始文档")
        docs.extend(loaded)
    print(f"✅ 总共加载 {len(docs)} 段原始文档")
    return docs
def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Cut raw documents into overlapping chunks sized for embedding.

    Args:
        docs: Documents as returned by the loader step.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        list: The resulting chunk documents.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = chunker.split_documents(docs)
    print(f"✅ 分片完成:从 {len(docs)} 段 -> {len(chunks)} 个 chunk")
    return chunks
def build_index(docs, persist_dir, model_name):
    """Embed ``docs`` and write them into a persistent Chroma collection.

    Args:
        docs: Chunk documents to embed and store.
        persist_dir: Directory for the on-disk Chroma store (created if absent).
        model_name: HuggingFace sentence-embedding model identifier.

    Exits the process with status 1 when ``docs`` is empty.
    """
    if not docs:
        print("❌ 没有任何 chunk 可供写入,请检查前面步骤!")
        sys.exit(1)

    os.makedirs(persist_dir, exist_ok=True)
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    store = Chroma(
        persist_directory=persist_dir,
        embedding_function=embeddings,
    )

    print("👉 正在写入向量库(自动持久化)……")
    store.add_documents(docs)

    # Reach into the underlying collection for a row count; Chroma exposes
    # no public counter here. NOTE(review): private API — may break on upgrade.
    written = store._collection.count()
    print(f"✅ 已写入 {written} 条 embeddings 到 '{persist_dir}'")
def main():
    """Run the full pipeline: load Markdown, split into chunks, build the index."""
    docs_dir = "./markdown_docs"
    persist_dir = "./vector_store"
    model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

    chunks = split_documents(
        load_markdown_files(docs_dir), chunk_size=1000, chunk_overlap=200
    )
    build_index(chunks, persist_dir, model_name)


if __name__ == "__main__":
    main()