nan committed
Commit bfadc62 · verified · 1 Parent(s): a5838bd

docs-update-readme-0624 (#23)


- docs: cherry-pick README from pr/17 e3e8a244 (40aa64361d8a8d16ff9335f623d8fb1d33ee5aa6)
- feat: add .gitignore (a188cd19ef4c2612fc8882f18621e9c82ffebd7d)
- docs: update the transformers and API codes (69ac66d5b083672544fb01ebc97227740153f190)
- docs: update the tech report link (3061fd752721844bbfaf7bb7a566d36ce28e6c06)
- docs: fix the code snippets (5a1b238231f4de5b3d9bdb5c8d9e74eae6883d60)

Files changed (2)
  1. .gitignore +73 -0
  2. README.md +303 -58
.gitignore ADDED
@@ -0,0 +1,73 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ env/
+ ENV/
+ .env
+ .venv
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+ .project
+ .pydevproject
+ .settings/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+ *.ipynb
+
+ # Distribution / packaging
+ .Python
+ *.manifest
+ *.spec
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+
+ # Logs and databases
+ *.log
+ *.sqlite
+ *.db
+
+ # OS generated files
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
README.md CHANGED
@@ -1,92 +1,337 @@
- # Jina Embeddings V4
-
- ## Examples
-
- Encode functions:
-
- ```python
- import torch
- from transformers import AutoModel
- from PIL import Image
-
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
- # Load model
- model = AutoModel.from_pretrained('jinaai/jina-embeddings-v4', trust_remote_code=True)
- model = model.to(device)
-
- # Sample data
- texts = ["Here is some sample code", "This is a matching text"]
- image_paths = ['/<path_to_image>']
- images = [Image.open(path) for path in image_paths]
-
- # Example 1: Text matching task with single vector embeddings
- # Generate embeddings with dimension truncation (256), decrease max_pixels
- img_embeddings = model.encode_images(images=images, truncate_dim=256, max_pixels=602112, task='text-matching')
- text_embeddings = model.encode_texts(texts=texts, truncate_dim=256, max_length=512, task='text-matching')
-
- # Example 2: Retrieval task with multi-vector embeddings
- model.set_task(task='retrieval')
-
- # Generate multi-vector embeddings
- img_embeddings = model.encode_images(images=images, vector_type='multi_vector')
- text_embeddings = model.encode_texts(texts=texts, vector_type='multi_vector', prompt_name='passage')
-
- # Example 3: Code task with single vector embeddings
- code = ["def hello_world():\n print('Hello, World!')"]
- code_embeddings = model.encode_texts(texts=code, task='code')
- ```
-
- Using the model forward:
-
- ```python
- import torch
- from transformers import AutoModel, AutoProcessor
- from PIL import Image
-
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
- # Load model and processor
- model = AutoModel.from_pretrained('jinaai/jina-embeddings-v4', trust_remote_code=True)
- model = model.to(device)
- processor = AutoProcessor.from_pretrained('jinaai/jina-embeddings-v4', trust_remote_code=True)
-
- # Sample data
- texts = ["Here is some sample code", "This is a matching text"]
- image_paths = ['/<path_to_image>']
-
- # Process text and images
- text_batch = processor.process_texts(texts=texts, prefix="Query", max_length=512)
- images = [Image.open(path) for path in image_paths]
- image_batch = processor.process_images(images=images)
-
- # Forward pass
- model.eval()
- with torch.no_grad():
-     text_batch = {k: v.to(device) for k, v in text_batch.items()}
-     image_batch = {k: v.to(device) for k, v in image_batch.items()}
-
-     with torch.autocast(device_type='cuda' if torch.cuda.is_available() else 'cpu'):
-         # Get embeddings
-         text_embeddings = model.model(**text_batch, task_label='retrieval').single_vec_emb
-         img_embeddings = model.model(**image_batch, task_label='retrieval').single_vec_emb
- ```
-
- Inference via the `SentenceTransformer` library:
-
- ```python
- from sentence_transformers import SentenceTransformer
-
- model = SentenceTransformer(
-     'jinaai/jina-embeddings-v4', trust_remote_code=True
- )
-
- emb = model.encode(['Khinkali is the best'], task='retrieval', prompt_name='query')
- ```
+ <br><br>
+
+ <p align="center">
+ <img src="https://huggingface.co/datasets/jinaai/documentation-images/resolve/main/logo.webp" alt="Jina AI: Your Search Foundation, Supercharged!" width="150px">
+ </p>
+
+ <p align="center">
+ <b>The embedding model trained by <a href="https://jina.ai/">Jina AI</a>.</b>
+ </p>
+
+ <p align="center">
+ <b>Jina Embeddings v4: Multilingual Multimodal Embeddings</b>
+ </p>
+
+
+ ## Quick Start
+
+ [Blog](https://alwaysjudgeabookbyitscover.com/) | [Technical Report](https://arxiv.org/abs/2506.18902) | [API](https://jina.ai/embeddings)
+
+
+ ## Intended Usage & Model Info
+
+ `jina-embeddings-v4` is a multilingual, multimodal embedding model designed for unified representation of text and images.
+ The model is specialized for complex document retrieval, including visually rich documents with charts, tables, and illustrations.
+ Embeddings produced by `jina-embeddings-v4` serve as the backbone for neural information retrieval and multimodal GenAI applications.
+
+ Built on [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), `jina-embeddings-v4` has the following features:
+
+ - **Unified embeddings** for text, images, and visual documents, supporting both dense (single-vector) and late-interaction (multi-vector) retrieval.
+ - **Multilingual support** (20+ languages) and compatibility with a wide range of domains, including technical and visually complex documents.
+ - **Task-specific adapters** for retrieval, text matching, and code-related tasks, selectable at inference time.
+ - **Flexible embedding size**: dense embeddings are 2048 dimensions by default but can be truncated to as few as 128 with minimal performance loss (see the truncation sketch after the table below).
+
+ Summary of features:
+
+ | Feature | Jina Embeddings V4 |
+ |------------|------------|
+ | Base Model | Qwen2.5-VL-3B-Instruct |
+ | Supported Tasks | `retrieval`, `text-matching`, `code` |
+ | Model DType | BFloat16 |
+ | Max Sequence Length | 32768 |
+ | Single-Vector Dimension | 2048 |
+ | Multi-Vector Dimension | 128 |
+ | Matryoshka Dimensions | 128, 256, 512, 1024, 2048 |
+ | Pooling Strategy | Mean pooling |
+ | Attention Mechanism | FlashAttention2 |
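The Matryoshka row above means the 2048-dimension dense vectors are front-loaded: you can keep a leading slice and re-normalize. A minimal sketch of that truncation, assuming the encode functions return row vectors convertible to a NumPy array; the `truncate_dim` argument in the snippets below does this for you, and the helper name here is ours, not part of the model API:

```python
import numpy as np

def truncate_embeddings(emb: np.ndarray, dim: int = 256) -> np.ndarray:
    """Keep the first `dim` Matryoshka dimensions and re-normalize.

    Assumes `emb` has shape (n, 2048); `dim` should be one of the
    supported Matryoshka sizes: 128, 256, 512, 1024, 2048.
    """
    truncated = emb[:, :dim]
    norms = np.linalg.norm(truncated, axis=1, keepdims=True)
    return truncated / np.clip(norms, 1e-12, None)

# e.g. full = model.encode_text(texts=["hello"], task="retrieval")  # (1, 2048)
# small = truncate_embeddings(np.asarray(full), dim=256)            # (1, 256)
```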
+ ## Training, Data, Parameters
+
+ Please refer to our [technical report of jina-embeddings-v4](https://arxiv.org/abs/2506.18902) for the model and training details.
+
+
+ ## Usage
+
+ <details>
+ <summary>Requirements</summary>
+
+ The following Python packages are required:
+
+ - `transformers>=4.52.0`
+ - `torch>=2.6.0`
+ - `peft>=0.15.2`
+ - `torchvision`
+ - `pillow`
+
+ ### Optional / Recommended
+
+ - **flash-attention**: Installing [flash-attention](https://github.com/Dao-AILab/flash-attention) is recommended for improved inference speed and efficiency, but it is not mandatory.
+ - **sentence-transformers**: If you want to use the model via the `sentence-transformers` interface, install this package as well.
+
+ </details>
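A quick way to confirm the version floors above are met in your environment; a stdlib-only sanity-check sketch, nothing model-specific:

```python
from importlib.metadata import version

# compare installed versions against the floors listed above
for pkg, floor in [("transformers", "4.52.0"), ("torch", "2.6.0"), ("peft", "0.15.2")]:
    print(f"{pkg}: installed {version(pkg)}, required >= {floor}")
```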
+
+ <details>
+ <summary>via <a href="https://jina.ai/embeddings/">Jina AI Embeddings API</a></summary>
+
+ ```bash
+ curl https://api.jina.ai/v1/embeddings \
+   -H "Content-Type: application/json" \
+   -H "Authorization: Bearer $JINA_AI_API_TOKEN" \
+   -d @- <<EOFEOF
+ {
+   "model": "jina-embeddings-v4",
+   "task": "text-matching",
+   "input": [
+     {"text": "غروب جميل على الشاطئ"},
+     {"text": "海滩上美丽的日落"},
+     {"text": "A beautiful sunset over the beach"},
+     {"text": "Un beau coucher de soleil sur la plage"},
+     {"text": "Ein wunderschöner Sonnenuntergang am Strand"},
+     {"text": "Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία"},
+     {"text": "समुद्र तट पर एक खूबसूरत सूर्यास्त"},
+     {"text": "Un bellissimo tramonto sulla spiaggia"},
+     {"text": "浜辺に沈む美しい夕日"},
+     {"text": "해변 위로 아름다운 일몰"},
+     {"image": "https://i.ibb.co/nQNGqL0/beach1.jpg"},
+     {"image": "https://i.ibb.co/r5w8hG8/beach2.jpg"}
+   ]
+ }
+ EOFEOF
+ ```
+
+ </details>
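The same call can be made from Python. A sketch with the `requests` library mirroring the curl command above (endpoint, model, task, and payload fields are taken from it; the `data[*].embedding` response layout is our assumption based on the usual embeddings-API shape):

```python
import os
import requests

# mirror the curl request above with a representative subset of inputs
resp = requests.post(
    "https://api.jina.ai/v1/embeddings",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ['JINA_AI_API_TOKEN']}",
    },
    json={
        "model": "jina-embeddings-v4",
        "task": "text-matching",
        "input": [
            {"text": "A beautiful sunset over the beach"},
            {"image": "https://i.ibb.co/nQNGqL0/beach1.jpg"},
        ],
    },
)
resp.raise_for_status()
# assumed response layout: {"data": [{"embedding": [...]}, ...]}
embeddings = [item["embedding"] for item in resp.json()["data"]]
```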
+
+ <details>
+ <summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a></summary>
+
+ ```python
+ # !pip install transformers>=4.52.0 torch>=2.6.0 peft>=0.15.2 torchvision pillow
+ from transformers import AutoModel
+ import torch
+
+ # Initialize the model
+ model = AutoModel.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)
+ model.to("cuda")
+
+ # ========================
+ # 1. Retrieval Task
+ # ========================
+ # Configure truncate_dim, max_length (for texts), max_pixels (for images),
+ # vector_type, batch_size in the encode function if needed
+
+ # Encode query
+ query_embeddings = model.encode_text(
+     texts=["Overview of climate change impacts on coastal cities"],
+     task="retrieval",
+     prompt_name="query",
+ )
+
+ # Encode passage (text)
+ passage_embeddings = model.encode_text(
+     texts=[
+         "Climate change has led to rising sea levels, increased frequency of extreme weather events..."
+     ],
+     task="retrieval",
+     prompt_name="passage",
+ )
+
+ # Encode image/document
+ image_embeddings = model.encode_image(
+     images=["https://i.ibb.co/nQNGqL0/beach1.jpg"],
+     task="retrieval",
+ )
+
+ # ========================
+ # 2. Text Matching Task
+ # ========================
+ texts = [
+     "غروب جميل على الشاطئ",  # Arabic
+     "海滩上美丽的日落",  # Chinese
+     "Un beau coucher de soleil sur la plage",  # French
+     "Ein wunderschöner Sonnenuntergang am Strand",  # German
+     "Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία",  # Greek
+     "समुद्र तट पर एक खूबसूरत सूर्यास्त",  # Hindi
+     "Un bellissimo tramonto sulla spiaggia",  # Italian
+     "浜辺に沈む美しい夕日",  # Japanese
+     "해변 위로 아름다운 일몰",  # Korean
+ ]
+
+ text_embeddings = model.encode_text(texts=texts, task="text-matching")
+
+ # ========================
+ # 3. Code Understanding Task
+ # ========================
+
+ # Encode query
+ query_embedding = model.encode_text(
+     texts=["Find a function that prints a greeting message to the console"],
+     task="code",
+     prompt_name="query",
+ )
+
+ # Encode code
+ code_embeddings = model.encode_text(
+     texts=["def hello_world():\n print('Hello, World!')"],
+     task="code",
+     prompt_name="passage",
+ )
+
+ # ========================
+ # 4. Use multivectors
+ # ========================
+ # (for scoring multi-vector outputs, see the MaxSim sketch after this block)
+
+ multivector_embeddings = model.encode_text(
+     texts=texts,
+     task="retrieval",
+     prompt_name="query",
+     return_multivector=True,
+ )
+
+ images = ["https://i.ibb.co/nQNGqL0/beach1.jpg", "https://i.ibb.co/r5w8hG8/beach2.jpg"]
+ multivector_image_embeddings = model.encode_image(
+     images=images,
+     task="retrieval",
+     return_multivector=True,
+ )
+ ```
+ </details>
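Multi-vector embeddings are intended for late-interaction retrieval, where each text or image is represented as a matrix of 128-dimension token vectors (per the table above). A sketch of the standard MaxSim scoring commonly used with such embeddings; the helper is illustrative, not part of the model's API:

```python
import torch

def maxsim_score(query_vecs: torch.Tensor, doc_vecs: torch.Tensor) -> torch.Tensor:
    """Late-interaction (MaxSim) relevance score.

    query_vecs: (num_query_tokens, dim), doc_vecs: (num_doc_tokens, dim);
    assumes rows are L2-normalized so dot products are cosine similarities.
    For each query token, take its best-matching document token, then sum.
    """
    sim = query_vecs @ doc_vecs.T          # (q_tokens, d_tokens)
    return sim.max(dim=1).values.sum()

# e.g. score the first query against each image (variable names from the block above):
# q = multivector_embeddings[0]
# scores = [maxsim_score(q, d) for d in multivector_image_embeddings]
```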
 
+ <details>
+ <summary>via <a href="https://sbert.net/">sentence-transformers</a></summary>
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ # Initialize the model
+ model = SentenceTransformer("jinaai/jina-embeddings-v4", trust_remote_code=True)
+
+ # ========================
+ # 1. Retrieval Task
+ # ========================
+ # Encode query
+ query_embeddings = model.encode(
+     sentences=["Overview of climate change impacts on coastal cities"],
+     task="retrieval",
+     prompt_name="query",
+ )
+
+ print(f"query_embeddings.shape = {query_embeddings.shape}")
+
+ # Encode passage (text)
+ passage_embeddings = model.encode(
+     sentences=[
+         "Climate change has led to rising sea levels, increased frequency of extreme weather events..."
+     ],
+     task="retrieval",
+     prompt_name="passage",
+ )
+
+ print(f"passage_embeddings.shape = {passage_embeddings.shape}")
+
+ # Encode image/document
+ image_embeddings = model.encode(
+     sentences=["https://i.ibb.co/nQNGqL0/beach1.jpg"],
+     task="retrieval",
+ )
+
+ print(f"image_embeddings.shape = {image_embeddings.shape}")
+
+ # ========================
+ # 2. Text Matching Task
+ # ========================
+ texts = [
+     "غروب جميل على الشاطئ",  # Arabic
+     "海滩上美丽的日落",  # Chinese
+     "Un beau coucher de soleil sur la plage",  # French
+     "Ein wunderschöner Sonnenuntergang am Strand",  # German
+     "Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία",  # Greek
+     "समुद्र तट पर एक खूबसूरत सूर्यास्त",  # Hindi
+     "Un bellissimo tramonto sulla spiaggia",  # Italian
+     "浜辺に沈む美しい夕日",  # Japanese
+     "해변 위로 아름다운 일몰",  # Korean
+ ]
+
+ text_embeddings = model.encode(sentences=texts, task="text-matching")
+
+ # ========================
+ # 3. Code Understanding Task
+ # ========================
+
+ # Encode query
+ query_embeddings = model.encode(
+     sentences=["Find a function that prints a greeting message to the console"],
+     task="code",
+     prompt_name="query",
+ )
+
+ # Encode code
+ code_embeddings = model.encode(
+     sentences=["def hello_world():\n print('Hello, World!')"],
+     task="code",
+     prompt_name="passage",
+ )
+
+ # ========================
+ # 4. Use multivectors
+ # ========================
+
+ multivector_text_embeddings = model.encode(
+     sentences=texts,
+     task="retrieval",
+     prompt_name="query",
+     return_multivector=True,
+ )
+
+ images = ["https://i.ibb.co/nQNGqL0/beach1.jpg", "https://i.ibb.co/r5w8hG8/beach2.jpg"]
+
+ multivector_image_embeddings = model.encode(
+     sentences=images,
+     task="retrieval",
+     return_multivector=True,
+ )
+ ```
+ </details>
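With the dense (single-vector) embeddings, retrieval reduces to a cosine-similarity ranking. A small sketch reusing `query_embeddings` and `passage_embeddings` from the block above (recent `sentence-transformers` releases also expose a `model.similarity` helper for the same purpose):

```python
import numpy as np

def cosine(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Cosine similarity matrix between the rows of a and the rows of b."""
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a @ b.T

# rank passages for the query encoded above; higher means more similar
scores = cosine(query_embeddings, passage_embeddings)
print(scores)  # shape (1, 1) for the single query/passage pair above
```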
+
+
+ ## License
+
+ This model is licensed for download and local use under [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/deed.en). It is available for commercial use via the [Jina Embeddings API](https://jina.ai/embeddings/), [AWS](https://longdogechallenge.com/), [Azure](https://longdogechallenge.com/), and [GCP](https://longdogechallenge.com/). To download it for commercial use, please [contact us](https://jina.ai/contact-sales).
+
+
+ ## Contact
+
+ Join our [Discord community](https://discord.jina.ai) and chat with other community members about ideas.
+
+
+ ## Citation
+
+ If you find `jina-embeddings-v4` useful in your research, please cite the following paper: