michael-guenther committed on
Commit fa5d48a · verified · 1 Parent(s): 5d9b9c5

Update README.md

Files changed (1)
README.md +166 -1
README.md CHANGED
@@ -57,7 +57,7 @@ The following Python packages are required:
 </details>
 
 <details>
- <summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a></summary>
+ <summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a> (AutoModel with trust_remote_code=True)</summary>
 
 ```python
 # !pip install transformers>=4.53.0 torch>=2.7.1
@@ -87,6 +87,92 @@ passage_embeddings = model.encode(
 ```
 </details>
 
+ <details>
+ <summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a> (using Qwen2Model without trust_remote_code)</summary>
+
+ ```python
+ # !pip install transformers>=4.53.0 torch>=2.7.1
+
+ import torch
+ import torch.nn.functional as F
+
+ from transformers.models.qwen2 import Qwen2Model
+ from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+
+ # Task-specific instruction prefixes for queries and passages
+ INSTRUCTION_CONFIG = {
+     "nl2code": {
+         "query": "Find the most relevant code snippet given the following query:\n",
+         "passage": "Candidate code snippet:\n"
+     },
+     "qa": {
+         "query": "Find the most relevant answer given the following question:\n",
+         "passage": "Candidate answer:\n"
+     },
+     "code2code": {
+         "query": "Find an equivalent code snippet given the following code snippet:\n",
+         "passage": "Candidate code snippet:\n"
+     },
+     "code2nl": {
+         "query": "Find the most relevant comment given the following code snippet:\n",
+         "passage": "Candidate comment:\n"
+     },
+     "code2completion": {
+         "query": "Find the most relevant completion given the following start of code snippet:\n",
+         "passage": "Candidate completion:\n"
+     }
+ }
+
+ MAX_LENGTH = 8192
+
+ def cosine_similarity(x, y):
+     x = F.normalize(x, p=2, dim=1)
+     y = F.normalize(y, p=2, dim=1)
+     return x @ y.T
+
+ def last_token_pool(last_hidden_states, attention_mask):
+     # With left padding the final position always holds the last token;
+     # otherwise index each sequence at its own last non-padded position.
+     left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+     if left_padding:
+         return last_hidden_states[:, -1]
+     else:
+         sequence_lengths = attention_mask.sum(dim=1) - 1
+         batch_size = last_hidden_states.shape[0]
+         return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
+
+ def add_instruction(instruction, query):
+     return f'{instruction}{query}'
+
+ # The queries and documents to embed
+ queries = [
+     add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "print hello world in python"),
+     add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "initialize array of 5 zeros in c++")
+ ]
+ documents = [
+     add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "print('Hello World!')"),
+     add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "int arr[5] = {0, 0, 0, 0, 0};")
+ ]
+ all_inputs = queries + documents
+
+ tokenizer = Qwen2TokenizerFast.from_pretrained('jinaai/jina-code-embeddings-1.5b')
+ model = Qwen2Model.from_pretrained('jinaai/jina-code-embeddings-1.5b')
+
+ batch_dict = tokenizer(
+     all_inputs,
+     padding=True,
+     truncation=True,
+     max_length=MAX_LENGTH,
+     return_tensors="pt",
+ )
+ batch_dict.to(model.device)
+ outputs = model(**batch_dict)
+ embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+ query_embeddings = embeddings[:2]
+ passage_embeddings = embeddings[2:]
+
+ # Compute the (cosine) similarity between the query and document embeddings
+ scores = cosine_similarity(query_embeddings, passage_embeddings)
+ ```
+ </details>
+
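A minimal usage sketch, assuming the `scores`, `queries`, and `documents` defined in the block above: each row of `scores` holds one query's cosine similarity to every passage, so an argmax over each row picks the best match.

```python
# Usage sketch (assumes `scores`, `queries`, `documents` from the example above).
best = scores.argmax(dim=1)
for qi in range(len(queries)):
    print(f"query {qi} -> passage {best[qi].item()} "
          f"(cosine={scores[qi, best[qi]].item():.3f})")
```
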
 <details>
 <summary>via <a href="https://sbert.net/">sentence-transformers</a></summary>
 
@@ -127,6 +213,85 @@ print(similarity)
 ```
 </details>
 
+ <details>
+ <summary>via <a href="https://github.com/vllm-project/vllm">vLLM</a></summary>
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+ from vllm import LLM
+
+ # Task-specific instruction prefixes for queries and passages
+ INSTRUCTION_CONFIG = {
+     "nl2code": {
+         "query": "Find the most relevant code snippet given the following query:\n",
+         "passage": "Candidate code snippet:\n"
+     },
+     "qa": {
+         "query": "Find the most relevant answer given the following question:\n",
+         "passage": "Candidate answer:\n"
+     },
+     "code2code": {
+         "query": "Find an equivalent code snippet given the following code snippet:\n",
+         "passage": "Candidate code snippet:\n"
+     },
+     "code2nl": {
+         "query": "Find the most relevant comment given the following code snippet:\n",
+         "passage": "Candidate comment:\n"
+     },
+     "code2completion": {
+         "query": "Find the most relevant completion given the following start of code snippet:\n",
+         "passage": "Candidate completion:\n"
+     }
+ }
+
+ def add_instruction(instruction, text):
+     return f"{instruction}{text}"
+
+ def cosine_similarity(x, y):
+     x = F.normalize(x, p=2, dim=1)
+     y = F.normalize(y, p=2, dim=1)
+     return x @ y.T
+
+ # Build the queries and documents
+ queries = [
+     add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "print hello world in python"),
+     add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "initialize array of 5 zeros in c++"),
+ ]
+ documents = [
+     add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "print('Hello World!')"),
+     add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "int arr[5] = {0, 0, 0, 0, 0};"),
+ ]
+ all_inputs = queries + documents
+
+ # vLLM embedding model
+ llm = LLM(
+     model="jinaai/jina-code-embeddings-1.5b",
+     hf_overrides={"architectures": ["Qwen2ForCausalLM"]},
+     task="embed"
+ )
+
+ # Encode with vLLM
+ outputs = llm.encode(all_inputs)
+
+ # Collect embeddings into a single tensor
+ emb_list = []
+ for out in outputs:
+     vec = out.outputs.data.detach()
+     emb_list.append(vec)
+ embeddings = torch.stack(emb_list, dim=0)
+
+ # Split into query and passage embeddings
+ n_q = len(queries)
+ query_embeddings = embeddings[:n_q]
+ passage_embeddings = embeddings[n_q:]
+
+ # Cosine similarity matrix (queries x documents)
+ scores = cosine_similarity(query_embeddings, passage_embeddings)
+ ```
+
+ </details>
+
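A follow-on sketch under the same assumptions (the `llm`, helpers, and embeddings from the vLLM block above): embed one new query and retrieve the closest passage.

```python
# Follow-on sketch (assumes `llm`, `add_instruction`, `INSTRUCTION_CONFIG`,
# `cosine_similarity`, `documents`, `passage_embeddings` from above).
new_query = add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "reverse a list in python")
q_emb = llm.encode([new_query])[0].outputs.data.detach().unsqueeze(0)
sims = cosine_similarity(q_emb, passage_embeddings)  # shape (1, n_passages)
print(documents[int(sims.argmax())])
```
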
 ## Training & Evaluation
 
  Please refer to our [technical report of jina-code-embeddings](https://arxiv.org/abs/2508.21290) for training details and benchmarks.