# Smart_Learning_Assistant / build_index.py
# Uploaded by ljy5946 ("Upload build_index.py", commit 0fc94ca, verified)
# NOTE(review): the lines above were web-page residue from the model hub;
# converted to comments so the file is valid Python.
# build_index.py
import os
import glob
import sys
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
def load_markdown_files(docs_dir):
    """Recursively load every ``*.md`` file under *docs_dir* as LangChain documents.

    Prints a per-file progress line and exits the process with status 1
    when no Markdown file is found at all.
    """
    root = os.path.abspath(docs_dir)
    print(f"👉 正在扫描目录: {root}")
    md_paths = glob.glob(os.path.join(root, "**", "*.md"), recursive=True)
    if not md_paths:
        print("❌ 没有找到任何 Markdown 文件,请检查 docs_dir 配置!")
        sys.exit(1)
    documents = []
    for md_path in md_paths:
        pieces = TextLoader(md_path, encoding="utf-8").load()
        print(f" - {os.path.basename(md_path)}: 加载 {len(pieces)} 段原始文档")
        documents.extend(pieces)
    print(f"✅ 总共加载 {len(documents)} 段原始文档")
    return documents
def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Split raw documents into overlapping character chunks for embedding.

    Returns the new list of chunk documents and prints a before/after count.
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)
    print(f"✅ 分片完成:从 {len(docs)} 段 -> {len(chunks)} 个 chunk")
    return chunks
def build_index(docs, persist_dir, model_name):
    """Embed *docs* with a HuggingFace model and write them to a Chroma store.

    Args:
        docs: chunked LangChain documents to embed.
        persist_dir: directory for the on-disk Chroma database (created if missing).
        model_name: sentence-transformers model id used for embeddings.

    Exits the process with status 1 when *docs* is empty.
    """
    if not docs:
        print("❌ 没有任何 chunk 可供写入,请检查前面步骤!")
        sys.exit(1)
    os.makedirs(persist_dir, exist_ok=True)
    emb = HuggingFaceEmbeddings(model_name=model_name)
    db = Chroma(
        persist_directory=persist_dir,
        embedding_function=emb,
    )
    print("👉 正在写入向量库(自动持久化)……")
    # add_documents returns the ids of the records just inserted, which is the
    # public API for counting this write — the previous `db._collection.count()`
    # poked a private attribute and reported the whole collection's size
    # (including pre-existing entries), not what this run actually wrote.
    ids = db.add_documents(docs)
    print(f"✅ 已写入 {len(ids)} 条 embeddings 到 '{persist_dir}'")
def main():
    """Run the full pipeline: load Markdown, split into chunks, build the index."""
    docs_dir = "./markdown_docs"
    persist_dir = "./vector_store"
    model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    chunks = split_documents(
        load_markdown_files(docs_dir),
        chunk_size=1000,
        chunk_overlap=200,
    )
    build_index(chunks, persist_dir, model_name)


if __name__ == "__main__":
    main()