# Spaces: Sleeping
# File size: 2,314 Bytes
# Revision: 0fc94ca
# build_index.py
import os
import glob
import sys
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
def load_markdown_files(docs_dir):
    """Load every Markdown file found under ``docs_dir``, searching recursively.

    Args:
        docs_dir: Directory to scan. It is resolved to an absolute path
            before searching.

    Returns:
        A list holding the documents returned by ``TextLoader.load()`` for
        each ``*.md`` file, concatenated in sorted path order.

    Raises:
        SystemExit: With exit code 1 when no Markdown file is found.
    """
    abs_docs_dir = os.path.abspath(docs_dir)
    print(f"👉 正在扫描目录: {abs_docs_dir}")

    pattern = os.path.join(abs_docs_dir, "**", "*.md")
    # glob yields matches in filesystem-dependent order; sorting keeps the
    # document order (and therefore the chunk order) stable across runs.
    file_paths = sorted(glob.glob(pattern, recursive=True))
    if not file_paths:
        print("❌ 没有找到任何 Markdown 文件,请检查 docs_dir 配置!")
        sys.exit(1)

    docs = []
    for path in file_paths:
        loaded = TextLoader(path, encoding="utf-8").load()
        print(f" - {os.path.basename(path)}: 加载 {len(loaded)} 段原始文档")
        docs.extend(loaded)

    print(f"✅ 总共加载 {len(docs)} 段原始文档")
    return docs
def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Split ``docs`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunk list produced by the splitter's ``split_documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)
    print(f"✅ 分片完成:从 {len(docs)} 段 -> {len(chunks)} 个 chunk")
    return chunks
def build_index(docs, persist_dir, model_name):
    """Embed ``docs`` and add them to a Chroma store rooted at ``persist_dir``.

    Args:
        docs: Chunks to add to the store; must be non-empty.
        persist_dir: Directory handed to Chroma as ``persist_directory``.
            It is created if it does not exist yet.
        model_name: Model name handed to ``HuggingFaceEmbeddings``.

    Raises:
        SystemExit: With exit code 1 when ``docs`` is empty.
    """
    # Bail out before creating any directory or loading the embedding model.
    if not docs:
        print("❌ 没有任何 chunk 可供写入,请检查前面步骤!")
        sys.exit(1)

    os.makedirs(persist_dir, exist_ok=True)
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = Chroma(
        embedding_function=embedding_model,
        persist_directory=persist_dir,
    )

    print("👉 正在写入向量库(自动持久化)……")
    vector_store.add_documents(docs)

    # Read the entry count straight from the underlying collection object
    # (a private attribute of the store).
    stored_total = vector_store._collection.count()
    print(f"✅ 已写入 {stored_total} 条 embeddings 到 '{persist_dir}'")
def main():
    """Run the pipeline: load Markdown files, split them, build the index."""
    source_dir = "./markdown_docs"
    store_dir = "./vector_store"
    embedding_model_name = (
        "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )

    chunks = split_documents(
        load_markdown_files(source_dir),
        chunk_size=1000,
        chunk_overlap=200,
    )
    build_index(chunks, store_dir, embedding_model_name)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|