ljy5946 committed on
Commit 75d6e02 · verified
1 Parent(s): 1c34cf7

Upload build_vector_store.py

Files changed (1)
  1. vector_build/build_vector_store.py +50 -0
vector_build/build_vector_store.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ import shutil
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+
+ # ====== 1. Set paths ======
+ md_folder = "../"  # directory containing the Markdown files
+ persist_path = "../vector_store"  # where the vector store is persisted
+
+ # ====== 2. Remove any existing vector store ======
+ if os.path.exists(persist_path):
+     print("⚠️ Existing vector store detected, deleting and rebuilding…")
+     shutil.rmtree(persist_path)
+
+ # ====== 3. Load Markdown files ======
+ docs = []
+ for filename in os.listdir(md_folder):
+     if filename.endswith(".md"):
+         file_path = os.path.join(md_folder, filename)
+         with open(file_path, "r", encoding="utf-8") as f:
+             text = f.read()
+         docs.append(Document(page_content=text, metadata={"source": filename}))
+
+ if not docs:
+     print("❌ No Markdown files found, please check the path and file names")
+     raise SystemExit(1)
+
+ # ====== 4. Split the text into chunks ======
+ splitter = RecursiveCharacterTextSplitter(
+     chunk_size=500,
+     chunk_overlap=100,
+     separators=["\n\n", "\n", "。", ".", ",", ","],
+ )
+ split_docs = splitter.split_documents(docs)
+ print(f"🐣 Split into {len(split_docs)} text chunks, starting embedding…")
+
+ # ====== 5. Build and persist the vector store ======
+ embedding_model = HuggingFaceEmbeddings(
+     model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+ )
+ vectordb = Chroma.from_documents(
+     documents=split_docs,
+     embedding=embedding_model,
+     persist_directory=persist_path,
+ )
+
+ vectordb.persist()
+ print(f"✅ Vector store saved to: {persist_path}")