import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
# 1. Set up paths
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
PERSIST_DIR = os.path.abspath(os.path.join(BASE_DIR, "../vector_store"))  # where the vector store is persisted
SOURCE_DIR = BASE_DIR  # the .md files live in the current vector_build/ directory
# 2. Load the embedding model
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)
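# Optional sanity check (an addition, not part of the original script): embed a
# short string and confirm the output dimensionality; mpnet-base models produce
# 768-dimensional vectors.
sample_vec = embed_model.embed_query("sanity check")
print(f"Embedding dimension: {len(sample_vec)}")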
# 3. Load the Markdown files and split them into small chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50
)
docs = []
for fname in os.listdir(SOURCE_DIR):
    if fname.endswith(".md"):
        with open(os.path.join(SOURCE_DIR, fname), "r", encoding="utf-8") as f:
            raw_text = f.read()
        chunks = text_splitter.split_text(raw_text)
        for chunk in chunks:
            docs.append({
                "text": chunk,
                "source": fname
            })
print(f"🐣 共切分出 {len(docs)} 个文本块,准备向量化...") | |
# 4. Build the Chroma vector store
texts = [d["text"] for d in docs]
metas = [{"source": d["source"]} for d in docs]
vectordb = Chroma.from_texts(
    texts=texts,
    embedding=embed_model,
    metadatas=metas,
    persist_directory=PERSIST_DIR
)
vectordb.persist()  # explicit flush; recent Chroma releases persist automatically, so this mainly matters on older versions
print(f"🎉 Vector store built and saved to: {PERSIST_DIR}")