# Smart_Learning_Assistant / build_index.py
# Uploaded by ljy5946 ("Upload build_index.py", commit 0fc94ca, verified)
# NOTE(review): the lines above were web-page residue from the model hub;
# converted to comments so the file is valid Python.
# build_index.py
import os
import glob
import sys
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
def load_markdown_files(docs_dir):
    """Recursively load every ``*.md`` file under *docs_dir* as LangChain documents.

    Prints a per-file progress line and exits the process with status 1
    when no Markdown file is found at all.
    """
    root = os.path.abspath(docs_dir)
    print(f"👉 正在扫描目录: {root}")
    md_paths = glob.glob(os.path.join(root, "**", "*.md"), recursive=True)
    if not md_paths:
        print("❌ 没有找到任何 Markdown 文件,请检查 docs_dir 配置!")
        sys.exit(1)
    documents = []
    for md_path in md_paths:
        pieces = TextLoader(md_path, encoding="utf-8").load()
        print(f" - {os.path.basename(md_path)}: 加载 {len(pieces)} 段原始文档")
        documents.extend(pieces)
    print(f"✅ 总共加载 {len(documents)} 段原始文档")
    return documents
def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Split raw documents into overlapping character chunks for embedding.

    Returns the new list of chunk documents and prints a before/after count.
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)
    print(f"✅ 分片完成:从 {len(docs)} 段 -> {len(chunks)} 个 chunk")
    return chunks
def build_index(docs, persist_dir, model_name):
    """Embed *docs* with a HuggingFace model and write them to a Chroma store.

    Args:
        docs: chunked LangChain documents to embed.
        persist_dir: directory for the on-disk Chroma database (created if missing).
        model_name: sentence-transformers model id used for embeddings.

    Exits the process with status 1 when *docs* is empty.
    """
    if not docs:
        print("❌ 没有任何 chunk 可供写入,请检查前面步骤!")
        sys.exit(1)
    os.makedirs(persist_dir, exist_ok=True)
    emb = HuggingFaceEmbeddings(model_name=model_name)
    db = Chroma(
        persist_directory=persist_dir,
        embedding_function=emb,
    )
    print("👉 正在写入向量库(自动持久化)……")
    # add_documents returns the ids of the records just inserted, which is the
    # public API for counting this write — the previous `db._collection.count()`
    # poked a private attribute and reported the whole collection's size
    # (including pre-existing entries), not what this run actually wrote.
    ids = db.add_documents(docs)
    print(f"✅ 已写入 {len(ids)} 条 embeddings 到 '{persist_dir}'")
def main():
    """Run the full pipeline: load Markdown, split into chunks, build the index."""
    docs_dir = "./markdown_docs"
    persist_dir = "./vector_store"
    model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    chunks = split_documents(
        load_markdown_files(docs_dir),
        chunk_size=1000,
        chunk_overlap=200,
    )
    build_index(chunks, persist_dir, model_name)


if __name__ == "__main__":
    main()