Spaces:

ljy5946
/

Smart_Learning_Assistant

Sleeping

App Files Files Community

ljy5946 committed on Jun 8

Commit

1c34cf7

verified ·

1 Parent(s): 091292c

Delete vector_build/build_vector_store.py

Browse files

Files changed (1) hide show

vector_build/build_vector_store.py +0 -47

vector_build/build_vector_store.py DELETED Viewed

@@ -1,47 +0,0 @@
-import os
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import Chroma
-# 1. Set up paths (all resolved relative to this script's own location)
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
-PERSIST_DIR = os.path.abspath(os.path.join(BASE_DIR, "../vector_store"))  # where the vector store is persisted
-SOURCE_DIR = BASE_DIR  # the .md source files live alongside this script in vector_build/
-# 2. Load the embedding model
-embed_model = HuggingFaceEmbeddings(
-    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
-)
-# 3. Load the Markdown files & split them into small chunks, remembering each chunk's source file
-text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size=500, chunk_overlap=50
-)
-docs = []
-for fname in os.listdir(SOURCE_DIR):
-    if fname.endswith(".md"):
-        with open(os.path.join(SOURCE_DIR, fname), "r", encoding="utf-8") as f:
-            raw_text = f.read()
-        chunks = text_splitter.split_text(raw_text)
-        for chunk in chunks:
-            docs.append({
-                "text": chunk,
-                "source": fname
-            })
-print(f"🐣 共切分出 {len(docs)} 个文本块，准备向量化...")
-# 4. Create the Chroma vector store from the chunk texts, with the source filename as metadata
-texts = [d["text"] for d in docs]
-metas = [{"source": d["source"]} for d in docs]
-vectordb = Chroma.from_texts(
-    texts=texts,
-    embedding=embed_model,
-    metadatas=metas,
-    persist_directory=PERSIST_DIR
-)
-vectordb.persist()  # NOTE(review): newer Chroma releases persist automatically and deprecate persist() - confirm against the installed version
-print(f"🎉 向量库生成完毕，已保存在：{PERSIST_DIR}")