# build_index.py

import os
import glob
import sys
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

def load_markdown_files(docs_dir):
    """Recursively load every Markdown file under docs_dir."""
    abs_docs_dir = os.path.abspath(docs_dir)
    print(f"👉 Scanning directory: {abs_docs_dir}")
    file_paths = glob.glob(os.path.join(abs_docs_dir, "**", "*.md"), recursive=True)
    if not file_paths:
        print("❌ No Markdown files found; check the docs_dir setting!")
        sys.exit(1)

    docs = []
    for path in file_paths:
        loader = TextLoader(path, encoding="utf-8")
        loaded = loader.load()
        print(f"  - {os.path.basename(path)}: loaded {len(loaded)} raw document(s)")
        docs.extend(loaded)
    print(f"✅ Loaded {len(docs)} raw documents in total")
    return docs

def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks; the overlap keeps text that
    falls on a chunk boundary intact in at least one neighboring chunk."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    new_docs = splitter.split_documents(docs)
    print(f"✅ Splitting done: {len(docs)} documents -> {len(new_docs)} chunks")
    return new_docs

def build_index(docs, persist_dir, model_name):
    """Embed the chunks and write them into a persistent Chroma store."""
    if not docs:
        print("❌ No chunks to write; check the previous steps!")
        sys.exit(1)

    os.makedirs(persist_dir, exist_ok=True)
    emb = HuggingFaceEmbeddings(model_name=model_name)
    db = Chroma(
        persist_directory=persist_dir,
        embedding_function=emb,
    )

    print("👉 Writing to the vector store (persisted automatically)...")
    db.add_documents(docs)

    # Read the count from the underlying collection for a final sanity check;
    # _collection is a private attribute of langchain_chroma, used here only
    # for statistics.
    count = db._collection.count()
    print(f"✅ Wrote {count} embeddings to '{persist_dir}'")
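
# Note: Chroma.from_documents(docs, embedding=emb, persist_directory=...) is
# an equivalent one-shot helper that creates the collection and embeds the
# chunks in a single call; the explicit Chroma(...) + add_documents() sequence
# above does the same thing step by step.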

def main():
    docs_dir    = "./markdown_docs"
    persist_dir = "./vector_store"
    model_name  = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

    raw_docs = load_markdown_files(docs_dir)
    docs     = split_documents(raw_docs, chunk_size=1000, chunk_overlap=200)
    build_index(docs, persist_dir, model_name)

if __name__ == "__main__":
    main()
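
# --- Usage sketch (illustrative; not executed by this script) --------------
# A minimal example of reading the persisted index back and running a
# similarity search. It assumes the same embedding model that was used at
# build time; the query string and k are placeholders.
#
#   from langchain_huggingface import HuggingFaceEmbeddings
#   from langchain_chroma import Chroma
#
#   emb = HuggingFaceEmbeddings(
#       model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
#   )
#   db = Chroma(persist_directory="./vector_store", embedding_function=emb)
#   for doc in db.similarity_search("your question here", k=3):
#       print(doc.metadata.get("source"), doc.page_content[:80])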