ljy5946 committed on
Commit
a581ee8
·
verified ·
1 Parent(s): 3d1eeca

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
build_vector_store.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build a persistent Chroma vector store from the Markdown files next to this script.

Every ``*.md`` file in the script's own directory is read as UTF-8, split into
overlapping chunks, embedded with a multilingual sentence-transformers model,
and written to ``../vector_store`` (relative to this script).
"""
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# 1. Paths
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
PERSIST_DIR = os.path.abspath(os.path.join(BASE_DIR, "../vector_store"))  # where the vector store is written
SOURCE_DIR = BASE_DIR  # the .md files live alongside this script

# 2. Embedding model
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# 3. Read the Markdown files and split them into small chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50
)

docs = []
# os.listdir() returns entries in arbitrary order; sort so chunk order (and
# therefore the generated ids) is the same on every run and every machine.
for fname in sorted(os.listdir(SOURCE_DIR)):
    path = os.path.join(SOURCE_DIR, fname)
    # Skip non-Markdown entries and anything that is not a regular file
    # (e.g. a directory that happens to be named "something.md").
    if not (fname.endswith(".md") and os.path.isfile(path)):
        continue
    with open(path, "r", encoding="utf-8") as f:
        raw_text = f.read()
    for index, chunk in enumerate(text_splitter.split_text(raw_text)):
        docs.append({
            "id": f"{fname}-{index}",
            "text": chunk,
            "source": fname,
        })

print(f"🐣 共切分出 {len(docs)} 个文本块,准备向量化...")

# Stop early with a readable message instead of handing empty lists to the
# vector store.
if not docs:
    raise SystemExit(f"⚠️ 在 {SOURCE_DIR} 中没有找到可用的 .md 文本,未生成向量库。")

# 4. Create the Chroma vector store
texts = [d["text"] for d in docs]
metas = [{"source": d["source"]} for d in docs]
# Stable per-chunk ids: a re-run against an existing PERSIST_DIR is expected
# to overwrite the entries with the same id rather than append duplicates
# (Chroma writes by id). NOTE(review): if a source file shrinks, chunks with
# higher indices from an earlier run stay in the store; delete PERSIST_DIR
# first when a clean rebuild is required.
ids = [d["id"] for d in docs]

vectordb = Chroma.from_texts(
    texts=texts,
    embedding=embed_model,
    metadatas=metas,
    ids=ids,
    persist_directory=PERSIST_DIR,
)
# NOTE(review): kept for older chromadb releases; newer releases are
# understood to persist automatically, making this call redundant — confirm
# against the installed version before removing it.
vectordb.persist()

print(f"🎉 向量库生成完毕,已保存在:{PERSIST_DIR}")
chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f7a7a249bf3b1b4e7dd730cde0985cdf0220c849e5deeca31d4df7c912720f2
3
+ size 22417408
index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d2130d448d160bb76ed65984a84859d3d6b645d4a3bf09971b32e2eb4defe63
3
+ size 213668
高等数学上册.md ADDED
The diff for this file is too large to render. See raw diff
 
高等数学下册.md ADDED
The diff for this file is too large to render. See raw diff