ljy5946 committed on
Commit 0fc94ca · verified · 1 Parent(s): a4752c8

Upload build_index.py

Files changed (1)
  1. build_index.py +68 -0
build_index.py ADDED
@@ -0,0 +1,68 @@
# build_index.py

import os
import glob
import sys

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma


def load_markdown_files(docs_dir):
    # Recursively collect every *.md file under docs_dir and load it as a LangChain document.
    abs_docs_dir = os.path.abspath(docs_dir)
    print(f"👉 Scanning directory: {abs_docs_dir}")
    file_paths = glob.glob(os.path.join(abs_docs_dir, "**", "*.md"), recursive=True)
    if not file_paths:
        print("❌ No Markdown files found, please check the docs_dir setting!")
        sys.exit(1)

    docs = []
    for path in file_paths:
        loader = TextLoader(path, encoding="utf-8")
        loaded = loader.load()
        print(f" - {os.path.basename(path)}: loaded {len(loaded)} raw document(s)")
        docs.extend(loaded)
    print(f"✅ Loaded {len(docs)} raw documents in total")
    return docs


def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    # Split the raw documents into overlapping chunks suitable for embedding.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    new_docs = splitter.split_documents(docs)
    print(f"✅ Splitting done: {len(docs)} documents -> {len(new_docs)} chunks")
    return new_docs


def build_index(docs, persist_dir, model_name):
    # Embed the chunks and write them into a persistent Chroma vector store.
    if not docs:
        print("❌ No chunks to write, please check the previous steps!")
        sys.exit(1)

    os.makedirs(persist_dir, exist_ok=True)
    emb = HuggingFaceEmbeddings(model_name=model_name)
    db = Chroma(
        persist_directory=persist_dir,
        embedding_function=emb,
    )

    print("👉 Writing to the vector store (persisted automatically)...")
    db.add_documents(docs)

    # Access the underlying Chroma collection directly to count the stored embeddings.
    count = db._collection.count()
    print(f"✅ Wrote {count} embeddings to '{persist_dir}'")


def main():
    docs_dir = "./markdown_docs"
    persist_dir = "./vector_store"
    model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

    raw_docs = load_markdown_files(docs_dir)
    docs = split_documents(raw_docs, chunk_size=1000, chunk_overlap=200)
    build_index(docs, persist_dir, model_name)


if __name__ == "__main__":
    main()
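
Once this script has run, the persisted store can be reopened for retrieval with the same embedding model. The snippet below is a minimal sketch, not part of this commit: the file name query_index.py and the query string are placeholders, and it assumes the default persist_dir and model_name from main() above.

# query_index.py (hypothetical) — reopen the store built by build_index.py and search it
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Must match the model and persist_directory used when the index was built.
emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)
db = Chroma(persist_directory="./vector_store", embedding_function=emb)

# Return the 3 chunks most similar to the (placeholder) query text.
for doc in db.similarity_search("example query", k=3):
    print(doc.metadata.get("source"), doc.page_content[:80])

Reusing the exact embedding model matters: vectors written with paraphrase-multilingual-mpnet-base-v2 are only meaningfully comparable to query vectors produced by the same model.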