novamysticX committed
Commit 854cf7b · verified · 1 Parent(s): 4a83fd2

Update app.py

Files changed (1):
1. app.py (+179, −25)
app.py CHANGED
@@ -1,54 +1,208 @@
- from fastapi import FastAPI, Request
  from pydantic import BaseModel
- from transformers import AutoModel, AutoTokenizer
  import torch

- app = FastAPI()

- model_id = "Qwen/Qwen3-Embedding-0.6B"

- # Load tokenizer
- tokenizer = AutoTokenizer.from_pretrained(model_id)

- # Load model with GPU if available, else CPU
- use_gpu = torch.cuda.is_available()

- if use_gpu:
-     print("CUDA is available, loading model with 4-bit quantization on GPU.")
      model = AutoModel.from_pretrained(
-         model_id,
          device_map="auto",
          torch_dtype=torch.float16,
          load_in_4bit=True
      )
  else:
-     print("CUDA not available, loading model without 4-bit quantization on CPU.")
      model = AutoModel.from_pretrained(
-         model_id,
          device_map="cpu",
          torch_dtype=torch.float32
      )

  model.eval()

- class TextInput(BaseModel):
-     text: str

- @app.post("/embed")
- async def embed_text(input: TextInput):
-     inputs = tokenizer(input.text, return_tensors="pt", truncation=True, max_length=512)

-     # Move input tensors to same device as model
-     device = next(model.parameters()).device
      inputs = {k: v.to(device) for k, v in inputs.items()}

      with torch.no_grad():
          outputs = model(**inputs)
-     embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
-
-     # Convert to list for JSON serialization
-     return {"embedding": embeddings[0].cpu().tolist()}
+ # from fastapi import FastAPI, Request
+ # from pydantic import BaseModel
+ # from transformers import AutoModel, AutoTokenizer
+ # import torch
+
+ # app = FastAPI()
+
+ # model_id = "Qwen/Qwen3-Embedding-0.6B"
+
+ # # Load tokenizer
+ # tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ # # Load model with GPU if available, else CPU
+ # use_gpu = torch.cuda.is_available()
+
+ # if use_gpu:
+ #     print("CUDA is available, loading model with 4-bit quantization on GPU.")
+ #     model = AutoModel.from_pretrained(
+ #         model_id,
+ #         device_map="auto",
+ #         torch_dtype=torch.float16,
+ #         load_in_4bit=True
+ #     )
+ # else:
+ #     print("CUDA not available, loading model without 4-bit quantization on CPU.")
+ #     model = AutoModel.from_pretrained(
+ #         model_id,
+ #         device_map="cpu",
+ #         torch_dtype=torch.float32
+ #     )
+
+ # model.eval()
+
+ # class TextInput(BaseModel):
+ #     text: str
+
+ # @app.post("/embed")
+ # async def embed_text(input: TextInput):
+ #     inputs = tokenizer(input.text, return_tensors="pt", truncation=True, max_length=512)
+
+ #     # Move input tensors to same device as model
+ #     device = next(model.parameters()).device
+ #     inputs = {k: v.to(device) for k, v in inputs.items()}
+
+ #     with torch.no_grad():
+ #         outputs = model(**inputs)
+ #     embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
+
+ #     # Convert to list for JSON serialization
+ #     return {"embedding": embeddings[0].cpu().tolist()}
+
+ # from fastapi import FastAPI
+ # from pydantic import BaseModel
+ # from typing import List
+ # from transformers import AutoTokenizer, AutoModel
+ # import torch
+ # import torch.nn.functional as F
+
+ # app = FastAPI()
+
+ # # Model config
+ # MODEL_ID = "Qwen/Qwen3-Embedding-0.6B"
+ # USE_GPU = torch.cuda.is_available()
+
+ # # Load tokenizer
+ # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side='left')
+
+ # # Load model with appropriate settings
+ # if USE_GPU:
+ #     print("🔋 Loading model on GPU with 4-bit quantization...")
+ #     model = AutoModel.from_pretrained(
+ #         MODEL_ID,
+ #         device_map="auto",
+ #         torch_dtype=torch.float16,
+ #         load_in_4bit=True
+ #     )
+ # else:
+ #     print("🧠 Loading model on CPU...")
+ #     model = AutoModel.from_pretrained(
+ #         MODEL_ID,
+ #         device_map="cpu",
+ #         torch_dtype=torch.float32
+ #     )
+
+ # model.eval()
+ # device = next(model.parameters()).device
+
+ # # Input schema
+ # class EmbedRequest(BaseModel):
+ #     texts: List[str]
+
+ # # Output schema
+ # class EmbedResponse(BaseModel):
+ #     embeddings: List[List[float]]
+
+ # # Masked mean pooling (ignores padded tokens)
+ # def masked_mean_pooling(last_hidden_state, attention_mask):
+ #     mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+ #     masked_embeddings = last_hidden_state * mask
+ #     summed = masked_embeddings.sum(dim=1)
+ #     counts = mask.sum(dim=1)
+ #     return summed / counts.clamp(min=1e-9)
+
+ # @app.post("/embed", response_model=EmbedResponse)
+ # async def embed_texts(request: EmbedRequest):
+ #     # Tokenize input texts
+ #     inputs = tokenizer(
+ #         request.texts,
+ #         return_tensors="pt",
+ #         padding=True,
+ #         truncation=True,
+ #         max_length=32768  # Qwen supports long sequences
+ #     )
+ #     inputs = {k: v.to(device) for k, v in inputs.items()}
+
+ #     # Get embeddings
+ #     with torch.no_grad():
+ #         outputs = model(**inputs)
+ #         pooled = masked_mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])
+ #         normalized = F.normalize(pooled, p=2, dim=1)
+
+ #     return {"embeddings": normalized.cpu().tolist()}
+
+ from fastapi import FastAPI, HTTPException, Header, Depends
  from pydantic import BaseModel
+ from typing import List, Optional
+ from transformers import AutoTokenizer, AutoModel
  import torch
+ import torch.nn.functional as F
+ import os
+ from dotenv import load_dotenv

+ # Load environment variables from .env (if present)
+ load_dotenv()
+
+ # Load API key from environment variable
+ API_KEY = os.environ.get("API_KEY")
+ if not API_KEY:
+     raise RuntimeError("❌ API_KEY not set in environment!")

+ # Initialize FastAPI
+ app = FastAPI()

+ # Load tokenizer & model
+ MODEL_ID = "Qwen/Qwen3-Embedding-0.6B"
+ USE_GPU = torch.cuda.is_available()

+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side='left')

+ if USE_GPU:
+     print("🔋 Using GPU with 4-bit quantization")
      model = AutoModel.from_pretrained(
+         MODEL_ID,
          device_map="auto",
          torch_dtype=torch.float16,
          load_in_4bit=True
      )
  else:
+     print("🧠 Using CPU")
      model = AutoModel.from_pretrained(
+         MODEL_ID,
          device_map="cpu",
          torch_dtype=torch.float32
      )

  model.eval()
+ device = next(model.parameters()).device
+
+ # Schema
+ class EmbedRequest(BaseModel):
+     texts: List[str]

+ class EmbedResponse(BaseModel):
+     embeddings: List[List[float]]

+ # Auth dependency
+ async def verify_api_key(x_api_key: Optional[str] = Header(None)):
+     if x_api_key != API_KEY:
+         raise HTTPException(status_code=401, detail="Invalid or missing API key")

+ # Masked mean pooling (ignores padded tokens)
+ def masked_mean_pooling(last_hidden_state, attention_mask):
+     mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+     masked_embeddings = last_hidden_state * mask
+     summed = masked_embeddings.sum(dim=1)
+     counts = mask.sum(dim=1)
+     return summed / counts.clamp(min=1e-9)
+
+ # Endpoint
+ @app.post("/embed", response_model=EmbedResponse, dependencies=[Depends(verify_api_key)])
+ async def embed_texts(request: EmbedRequest):
+     inputs = tokenizer(
+         request.texts,
+         return_tensors="pt",
+         padding=True,
+         truncation=True,
+         max_length=32768
+     )
      inputs = {k: v.to(device) for k, v in inputs.items()}

      with torch.no_grad():
          outputs = model(**inputs)
+         pooled = masked_mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])
+         normalized = F.normalize(pooled, p=2, dim=1)

+     return {"embeddings": normalized.cpu().tolist()}
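For completeness, a client-side sketch of calling the protected endpoint. The base URL and key value are placeholders: substitute your deployment URL and whatever the server's API_KEY environment variable is set to. FastAPI maps the x_api_key: Header(None) parameter to the X-API-Key request header (header names are case-insensitive):

import requests

BASE_URL = "http://localhost:8000"  # placeholder deployment URL
API_KEY = "your-secret-key"         # placeholder; must match the server's API_KEY

resp = requests.post(
    f"{BASE_URL}/embed",
    json={"texts": ["hello world", "Qwen3 embeddings"]},
    headers={"X-API-Key": API_KEY},
)
resp.raise_for_status()
embeddings = resp.json()["embeddings"]
print(len(embeddings), len(embeddings[0]))  # one L2-normalized vector per input text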
 
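One caveat on the loading code: recent transformers releases deprecate passing load_in_4bit directly to from_pretrained in favor of an explicit BitsAndBytesConfig. A sketch of the equivalent call under that API, assuming bitsandbytes is installed and a CUDA device is available:

import torch
from transformers import AutoModel, BitsAndBytesConfig

# 4-bit weight storage with fp16 compute, matching the intent
# of the diff's GPU branch.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModel.from_pretrained(
    "Qwen/Qwen3-Embedding-0.6B",
    device_map="auto",
    quantization_config=quant_config,
)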