ljy5946 committed
Commit 1c34cf7 · verified · 1 Parent(s): 091292c

Delete vector_build/build_vector_store.py

Files changed (1)
  1. vector_build/build_vector_store.py +0 -47
vector_build/build_vector_store.py DELETED
@@ -1,47 +0,0 @@
- import os
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.vectorstores import Chroma
-
- # 1. Set up paths
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory of this script
- PERSIST_DIR = os.path.abspath(os.path.join(BASE_DIR, "../vector_store"))  # where the vector store is persisted
- SOURCE_DIR = BASE_DIR  # the .md files live in this vector_build/ directory
-
- # 2. Load the embedding model
- embed_model = HuggingFaceEmbeddings(
-     model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
- )
-
- # 3. Load the Markdown files & split them into small chunks
- text_splitter = RecursiveCharacterTextSplitter(
-     chunk_size=500, chunk_overlap=50
- )
-
- docs = []
- for fname in os.listdir(SOURCE_DIR):
-     if fname.endswith(".md"):
-         with open(os.path.join(SOURCE_DIR, fname), "r", encoding="utf-8") as f:
-             raw_text = f.read()
-         chunks = text_splitter.split_text(raw_text)
-         for chunk in chunks:
-             docs.append({
-                 "text": chunk,
-                 "source": fname
-             })
-
- print(f"🐣 Split into {len(docs)} text chunks, ready to embed...")
-
- # 4. Build the Chroma vector store
- texts = [d["text"] for d in docs]
- metas = [{"source": d["source"]} for d in docs]
-
- vectordb = Chroma.from_texts(
-     texts=texts,
-     embedding=embed_model,
-     metadatas=metas,
-     persist_directory=PERSIST_DIR
- )
- vectordb.persist()
-
- print(f"🎉 Vector store built and saved to: {PERSIST_DIR}")