Update README.md
Browse files
README.md
CHANGED
@@ -57,7 +57,7 @@ The following Python packages are required:
|
|
57 |
</details>
|
58 |
|
59 |
<details>
|
60 |
-
<summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a></summary>
|
61 |
|
62 |
```python
|
63 |
# !pip install "transformers>=4.53.0" "torch>=2.7.1"
|
@@ -87,6 +87,92 @@ passage_embeddings = model.encode(
|
|
87 |
```
|
88 |
</details>
|
89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
<details>
|
91 |
<summary>via <a href="https://sbert.net/">sentence-transformers</a></summary>
|
92 |
|
@@ -127,6 +213,85 @@ print(similarity)
|
|
127 |
```
|
128 |
</details>
|
129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
## Training & Evaluation
|
131 |
|
132 |
Please refer to our [technical report of jina-code-embeddings](https://arxiv.org/abs/2508.21290) for training details and benchmarks.
|
|
|
57 |
</details>
|
58 |
|
59 |
<details>
|
60 |
+
<summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a> (AutoModel with trust_remote_code=True)</summary>
|
61 |
|
62 |
```python
|
63 |
# !pip install "transformers>=4.53.0" "torch>=2.7.1"
|
|
|
87 |
```
|
88 |
</details>
|
89 |
|
90 |
+
<details>
|
91 |
+
<summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a> (using Qwen2Model without trust_remote_code)</summary>
|
92 |
+
|
93 |
+
```python
|
94 |
+
# !pip install "transformers>=4.53.0" "torch>=2.7.1"
|
95 |
+
|
96 |
+
import torch
|
97 |
+
import torch.nn.functional as F
|
98 |
+
|
99 |
+
from transformers.models.qwen2 import Qwen2Model
|
100 |
+
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
|
101 |
+
|
102 |
+
INSTRUCTION_CONFIG = {
|
103 |
+
"nl2code": {
|
104 |
+
"query": "Find the most relevant code snippet given the following query:\n",
|
105 |
+
"passage": "Candidate code snippet:\n"
|
106 |
+
},
|
107 |
+
"qa": {
|
108 |
+
"query": "Find the most relevant answer given the following question:\n",
|
109 |
+
"passage": "Candidate answer:\n"
|
110 |
+
},
|
111 |
+
"code2code": {
|
112 |
+
"query": "Find an equivalent code snippet given the following code snippet:\n",
|
113 |
+
"passage": "Candidate code snippet:\n"
|
114 |
+
},
|
115 |
+
"code2nl": {
|
116 |
+
"query": "Find the most relevant comment given the following code snippet:\n",
|
117 |
+
"passage": "Candidate comment:\n"
|
118 |
+
},
|
119 |
+
"code2completion": {
|
120 |
+
"query": "Find the most relevant completion given the following start of code snippet:\n",
|
121 |
+
"passage": "Candidate completion:\n"
|
122 |
+
}
|
123 |
+
}
|
124 |
+
|
125 |
+
MAX_LENGTH = 8192
|
126 |
+
|
127 |
+
def cosine_similarity(x,y):
|
128 |
+
x = F.normalize(x, p=2, dim=1)
|
129 |
+
y = F.normalize(y, p=2, dim=1)
|
130 |
+
return x @ y.T
|
131 |
+
|
132 |
+
def last_token_pool(last_hidden_states, attention_mask):
|
133 |
+
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
|
134 |
+
if left_padding:
|
135 |
+
return last_hidden_states[:, -1]
|
136 |
+
else:
|
137 |
+
sequence_lengths = attention_mask.sum(dim=1) - 1
|
138 |
+
batch_size = last_hidden_states.shape[0]
|
139 |
+
return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
|
140 |
+
|
141 |
+
def add_instruction(instruction, query):
|
142 |
+
return f'{instruction}{query}'
|
143 |
+
|
144 |
+
# The queries and documents to embed
|
145 |
+
queries = [
|
146 |
+
add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "print hello world in python"),
|
147 |
+
add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "initialize array of 5 zeros in c++")
|
148 |
+
]
|
149 |
+
documents = [
|
150 |
+
add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "print('Hello World!')"),
|
151 |
+
add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "int arr[5] = {0, 0, 0, 0, 0};")
|
152 |
+
]
|
153 |
+
all_inputs = queries + documents
|
154 |
+
|
155 |
+
tokenizer = Qwen2TokenizerFast.from_pretrained('jinaai/jina-code-embeddings-1.5b')
|
156 |
+
model = Qwen2Model.from_pretrained('jinaai/jina-code-embeddings-1.5b')
|
157 |
+
|
158 |
+
batch_dict = tokenizer(
|
159 |
+
all_inputs,
|
160 |
+
padding=True,
|
161 |
+
truncation=True,
|
162 |
+
max_length=MAX_LENGTH,
|
163 |
+
return_tensors="pt",
|
164 |
+
)
|
165 |
+
batch_dict.to(model.device)
|
166 |
+
outputs = model(**batch_dict)
|
167 |
+
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
|
168 |
+
query_embeddings = embeddings[:2]
|
169 |
+
passage_embeddings = embeddings[2:]
|
170 |
+
|
171 |
+
# Compute the (cosine) similarity between the query and document embeddings
|
172 |
+
scores = cosine_similarity(query_embeddings, passage_embeddings)
|
173 |
+
```
|
174 |
+
</details>
|
175 |
+
|
176 |
<details>
|
177 |
<summary>via <a href="https://sbert.net/">sentence-transformers</a></summary>
|
178 |
|
|
|
213 |
```
|
214 |
</details>
|
215 |
|
216 |
+
<details>
|
217 |
+
<summary>via <a href="https://github.com/vllm-project/vllm">vLLM</a></summary>
|
218 |
+
|
219 |
+
```python
|
220 |
+
|
221 |
+
import torch
|
222 |
+
import torch.nn.functional as F
|
223 |
+
from vllm import LLM
|
224 |
+
|
225 |
+
INSTRUCTION_CONFIG = {
|
226 |
+
"nl2code": {
|
227 |
+
"query": "Find the most relevant code snippet given the following query:\n",
|
228 |
+
"passage": "Candidate code snippet:\n"
|
229 |
+
},
|
230 |
+
"qa": {
|
231 |
+
"query": "Find the most relevant answer given the following question:\n",
|
232 |
+
"passage": "Candidate answer:\n"
|
233 |
+
},
|
234 |
+
"code2code": {
|
235 |
+
"query": "Find an equivalent code snippet given the following code snippet:\n",
|
236 |
+
"passage": "Candidate code snippet:\n"
|
237 |
+
},
|
238 |
+
"code2nl": {
|
239 |
+
"query": "Find the most relevant comment given the following code snippet:\n",
|
240 |
+
"passage": "Candidate comment:\n"
|
241 |
+
},
|
242 |
+
"code2completion": {
|
243 |
+
"query": "Find the most relevant completion given the following start of code snippet:\n",
|
244 |
+
"passage": "Candidate completion:\n"
|
245 |
+
}
|
246 |
+
}
|
247 |
+
|
248 |
+
def add_instruction(instruction, text):
|
249 |
+
return f"{instruction}{text}"
|
250 |
+
|
251 |
+
def cosine_similarity(x, y):
|
252 |
+
x = F.normalize(x, p=2, dim=1)
|
253 |
+
y = F.normalize(y, p=2, dim=1)
|
254 |
+
return x @ y.T
|
255 |
+
|
256 |
+
# Build the queries and documents
|
257 |
+
queries = [
|
258 |
+
add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "print hello world in python"),
|
259 |
+
add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "initialize array of 5 zeros in c++"),
|
260 |
+
]
|
261 |
+
documents = [
|
262 |
+
add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "print('Hello World!')"),
|
263 |
+
add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "int arr[5] = {0, 0, 0, 0, 0};"),
|
264 |
+
]
|
265 |
+
all_inputs = queries + documents
|
266 |
+
|
267 |
+
# vLLM embedding model
|
268 |
+
llm = LLM(
|
269 |
+
model="jinaai/jina-code-embeddings-1.5b",
|
270 |
+
hf_overrides={"architectures": ["Qwen2ForCausalLM"]},
|
271 |
+
task="embed"
|
272 |
+
)
|
273 |
+
|
274 |
+
# Encode with vLLM
|
275 |
+
outputs = llm.encode(all_inputs)
|
276 |
+
|
277 |
+
# Collect embeddings into a single tensor
|
278 |
+
emb_list = []
|
279 |
+
for out in outputs:
|
280 |
+
vec = out.outputs.data.detach()
|
281 |
+
emb_list.append(vec)
|
282 |
+
embeddings = torch.stack(emb_list, dim=0)
|
283 |
+
|
284 |
+
# Split into query and passage embeddings
|
285 |
+
n_q = len(queries)
|
286 |
+
query_embeddings = embeddings[:n_q]
|
287 |
+
passage_embeddings = embeddings[n_q:]
|
288 |
+
|
289 |
+
# Cosine similarity matrix (queries x documents)
|
290 |
+
scores = cosine_similarity(query_embeddings, passage_embeddings)
|
291 |
+
```
|
292 |
+
|
293 |
+
</details>
|
294 |
+
|
295 |
## Training & Evaluation
|
296 |
|
297 |
Please refer to our [technical report of jina-code-embeddings](https://arxiv.org/abs/2508.21290) for training details and benchmarks.
|