Add example on how to use the model with transformers without trust_remote_code
#1 by michael-guenther - opened
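In short (condensed from the diff below, using the same model id and classes), the new example avoids `trust_remote_code` by loading the checkpoint with the stock Qwen2 classes and pooling the last hidden state manually:

```python
# Condensed from the diff below: stock transformers classes only,
# so no remote code needs to be trusted.
from transformers.models.qwen2 import Qwen2Model
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast

tokenizer = Qwen2TokenizerFast.from_pretrained('jinaai/jina-code-embeddings-0.5b')
model = Qwen2Model.from_pretrained('jinaai/jina-code-embeddings-0.5b')
# Embeddings are then read off outputs.last_hidden_state via last-token
# pooling (see last_token_pool in the diff).
```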
README.md CHANGED
@@ -57,7 +57,7 @@ The following Python packages are required:
 </details>

 <details>
-<summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a></summary>
+<summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a> (AutoModel with trust_remote_code=True)</summary>

 ```python
 # !pip install transformers>=4.53.0 torch>=2.7.1
@@ -87,6 +87,95 @@ passage_embeddings = model.encode(
 ```
 </details>

+<details>
+<summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a> (using Qwen2Model without trust_remote_code)</summary>
+
+```python
+# !pip install transformers>=4.53.0 torch>=2.7.1
+
+import torch
+import torch.nn.functional as F
+
+from transformers.models.qwen2 import Qwen2Model
+from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+
+INSTRUCTION_CONFIG = {
+    "nl2code": {
+        "query": "Find the most relevant code snippet given the following query:\n",
+        "passage": "Candidate code snippet:\n"
+    },
+    "qa": {
+        "query": "Find the most relevant answer given the following question:\n",
+        "passage": "Candidate answer:\n"
+    },
+    "code2code": {
+        "query": "Find an equivalent code snippet given the following code snippet:\n",
+        "passage": "Candidate code snippet:\n"
+    },
+    "code2nl": {
+        "query": "Find the most relevant comment given the following code snippet:\n",
+        "passage": "Candidate comment:\n"
+    },
+    "code2completion": {
+        "query": "Find the most relevant completion given the following start of code snippet:\n",
+        "passage": "Candidate completion:\n"
+    }
+}
+
+MAX_LENGTH = 8192
+
+def cosine_similarity(x, y):
+    x = F.normalize(x, p=2, dim=1)
+    y = F.normalize(y, p=2, dim=1)
+    return x @ y.T
+
+def last_token_pool(last_hidden_states, attention_mask):
+    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+    if left_padding:
+        return last_hidden_states[:, -1]
+    else:
+        sequence_lengths = attention_mask.sum(dim=1) - 1
+        batch_size = last_hidden_states.shape[0]
+        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
+
+def add_instruction(instruction, query):
+    return f'{instruction}{query}'
+
+# The queries and documents to embed
+queries = [
+    add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "print hello world in python"),
+    add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "initialize array of 5 zeros in c++")
+]
+documents = [
+    add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "print('Hello World!')"),
+    add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "int arr[5] = {0, 0, 0, 0, 0};")
+]
+all_inputs = queries + documents
+
+tokenizer = Qwen2TokenizerFast.from_pretrained('jinaai/jina-code-embeddings-0.5b')
+model = Qwen2Model.from_pretrained('jinaai/jina-code-embeddings-0.5b')
+
+batch_dict = tokenizer(
+    all_inputs,
+    padding=True,
+    truncation=True,
+    max_length=MAX_LENGTH,
+    return_tensors="pt",
+)
+batch_dict.to(model.device)
+outputs = model(**batch_dict)
+embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+query_embeddings = embeddings[:2]
+passage_embeddings = embeddings[2:]
+
+# Compute the (cosine) similarity between the query and document embeddings
+scores = cosine_similarity(query_embeddings, passage_embeddings)
+print(scores)
+# tensor([[0.8168, 0.1236],
+#         [0.1204, 0.5525]], grad_fn=<MmBackward0>)
+```
+</details>
+
 <details>
 <summary>via <a href="https://sbert.net/">sentence-transformers</a></summary>

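A note on the pooling helper in the added example: last_token_pool selects, for each sequence, the hidden state of its final non-padding token, with a fast path when the batch is left-padded. A minimal self-contained sketch (toy tensors only, no model download; the function body is copied from the diff above) showing both branches:

```python
import torch

def last_token_pool(last_hidden_states, attention_mask):
    # Same logic as in the diff above.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_states.shape[0]
    return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

# Toy batch: 2 sequences, 4 positions, hidden size 2.
h = torch.arange(16, dtype=torch.float32).reshape(2, 4, 2)

right_padded = torch.tensor([[1, 1, 1, 0], [1, 1, 1, 1]])  # last real tokens at positions 2 and 3
print(last_token_pool(h, right_padded))  # rows h[0, 2] and h[1, 3]

left_padded = torch.tensor([[0, 1, 1, 1], [1, 1, 1, 1]])   # every row ends with a real token
print(last_token_pool(h, left_padded))   # fast path: simply h[:, -1]
```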