import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# 1. Set up paths
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
PERSIST_DIR = os.path.abspath(os.path.join(BASE_DIR, "../vector_store"))  # where the vector store is persisted
SOURCE_DIR = BASE_DIR  # the .md files live in the current vector_build/ directory

# 2. Load the embedding model
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# 3. Load the Markdown files and split them into small chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

docs = []
for fname in os.listdir(SOURCE_DIR):
    if fname.endswith(".md"):
        with open(os.path.join(SOURCE_DIR, fname), "r", encoding="utf-8") as f:
            raw_text = f.read()
        chunks = text_splitter.split_text(raw_text)
        for chunk in chunks:
            docs.append({
                "text": chunk,
                "source": fname
            })

print(f"🐣 Split into {len(docs)} text chunks, ready for vectorization...")

# 4. Build the Chroma vector store
texts = [d["text"] for d in docs]
metas = [{"source": d["source"]} for d in docs]

vectordb = Chroma.from_texts(
    texts=texts,
    embedding=embed_model,
    metadatas=metas,
    persist_directory=PERSIST_DIR
)
# With chromadb >= 0.4, writes are persisted automatically; the explicit
# persist() call is kept for compatibility with older versions.
vectordb.persist()

print(f"🎉 Vector store built and saved at: {PERSIST_DIR}")
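
# Optional sanity check (a minimal sketch, not part of the build step above):
# reload the persisted store from PERSIST_DIR and run a similarity search to
# confirm the chunks are retrievable. The query string is purely illustrative;
# substitute text that actually appears in your .md files.
reloaded_db = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embed_model
)
results = reloaded_db.similarity_search("How do I configure the project?", k=3)
for doc in results:
    # Each result is a Document carrying the chunk text and its source filename.
    print(doc.metadata["source"], "->", doc.page_content[:80])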