first commit
Browse files- 1_Pooling/config.json +10 -0
- LICENCE +96 -0
- README.md +122 -0
- README_JA.md +129 -0
- config.json +29 -0
- config_sentence_transformers.json +10 -0
- model.safetensors +3 -0
- modules.json +14 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +51 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +171 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 1792,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": false,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
7 |
+
"pooling_mode_weightedmean_tokens": false,
|
8 |
+
"pooling_mode_lasttoken": true,
|
9 |
+
"include_prompt": false
|
10 |
+
}
|
LICENCE
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Sarashina Model NonCommercial License Agreement
|
2 |
+
Released Date by SB Intuitions Corp.: November 8, 2024
|
3 |
+
|
4 |
+
This Sarashina Model NonCommercial License Agreement (this "Agreement") is a legal agreement between Licensee (as defined in Article 1) and SB Intuitions Corp. ("SB Intuitions"), and governs Licensee's use of the Models (as defined in Article 1) that SB Intuitions provides to Licensee under this Agreement.
|
5 |
+
|
6 |
+
The Models released under this Agreement are intended to be used under this Agreement and contribute to the development of AI technologies.
|
7 |
+
|
8 |
+
By using, reproducing, modifying, distributing, performing or displaying any portion or element of the Model or Derivative Models, or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement. If you do not agree to this Agreement, you may not download the Model or Derivative Models.
|
9 |
+
|
10 |
+
1. Definition
|
11 |
+
i. "Model" means the deliverables including machine learning models, software, learnt weights, algorithms, parameters and configuration files provided by SB Intuitions and shared under this Agreement.
|
12 |
+
ii. "Output Data" means the text, images, sounds, or other information output of the Model that results from operating or otherwise using the Model.
|
13 |
+
iii. "Derivative Model" means all modifications to the Model, or deliverables created based on the Model, and any other machine learning models which have been trained, tuned, or improved by using Output Data. For clarification, Output Data is not the Derivative Model.
|
14 |
+
iv. "Licensee" means the person or entity exercising the rights granted by this Agreement.
|
15 |
+
v. "Affiliate" means any entity that now or hereafter controls, is controlled by, or is under common control with, SB Intuitions Corp. For the purposes of this definition, "control" shall mean the direct or indirect ownership over or the legal right or power to, directly or indirectly, direct or alter the direction of the management and affairs of the entity.
|
16 |
+
vi. "License" means the rights granted by Licensee under Article 2 (Right)ii of this Agreement.
|
17 |
+
vii. "Intellectual Property Rights" means copyrights, patents, utility model rights, design rights, trademarks and other intellectual property rights including the right to acquire such rights or to apply for the registration, etc. of such rights.
|
18 |
+
|
19 |
+
2. Right
|
20 |
+
i. Condition
|
21 |
+
Any use, reproduction, modification, distribution, performance or display of the Model or Derivative Models shall be subject to the following conditions:
|
22 |
+
(a) Upon renewal of the License, Licensee shall agree to comply with the renewed License or to cease using, copying, modifying, distributing, performing or displaying the Model or Derivative Models.
|
23 |
+
(b) Licensee shall comply with the redistribution regulations set out in Article 3.
|
24 |
+
(c) Licensee shall comply with the prohibition regulations set out in Article 6.
|
25 |
+
(d) Licensee shall not use, copy, modify, distribute, perform or display the Model for commercial purpose (except for research and study purpose).
|
26 |
+
ii. Grant of License
|
27 |
+
Subject to Licensee's compliance with this Agreement, SB Intuitions grants Licensee a non-exclusive and non-transferable license to use, copy, distribute, modify, perform or display the Model. This license also includes the creation of Derivative Models from the Model.
|
28 |
+
iii. Reservation of Rights
|
29 |
+
(a) SB Intuitions reserves all rights, interests and remedies in and to the Model or Derivatives Models by SB Intuitions, and no other rights are granted to Licensee except as expressly set forth in this Agreement.
|
30 |
+
(b) Licensee reserves the copyright to all Derivative Models created by Licensee.
|
31 |
+
(c) SB Intuitions does not claim any rights in the Output Data generated from the Model or Derivative Models. To the extent permitted by law, SB Intuitions acknowledges that the Output Data belongs to Licensee. Licensee is responsible for the use of Output Data.
|
32 |
+
|
33 |
+
3. Redistribution
|
34 |
+
i. Condition of Redistribution
|
35 |
+
Licensee may only reproduce or redistribute the Model or Derivative Models if Licensee complies with all the conditions set out in Article 2, as well as the followings:
|
36 |
+
(a) Licensee shall provide all third-party recipients of the Model or Model Derivatives with a copy of this Agreement;
|
37 |
+
(b) Licensee shall, in all copies of the Model or Derivative Models that Licensee distributes, retain the following attribution notice within a “LICENSE” text file distributed as a part of such copies:
|
38 |
+
"Sarashina is licensed under the Sarashina Model NonCommercial License Agreement, Copyright ©SB Intuitions Corp. All Rights Reserved.";
|
39 |
+
(c) If Licensee distributes the Model or Derivative Models, Licensee shall also include "Sarashina" at the beginning of the name of any such Model or Derivative Models;
|
40 |
+
(d) If Licensee uses the Model or Derivative Models as a component of the system, Licensee shall clearly indicate “Built with Sarashina” in relevant materials of the system such as documents, user interfaces, and/or websites.
|
41 |
+
|
42 |
+
4. Trademark
|
43 |
+
Nothing in this Agreement grant the Licensee permission to use the trade names, trademarks, service marks, or product names owned by SB Intuitions. However, the use within customary scope or the necessity to implement the content of this Agreement, such as quoting the deliverables, describing the origin of the deliverables and reproducing the content of the LICENSE file, is permitted.
|
44 |
+
|
45 |
+
5. Termination and Changes of the License
|
46 |
+
i. Litigation
|
47 |
+
If Licensee institutes a lawsuit (including a cross-claim or counterclaim in a lawsuit) or other legal proceeding against any party in connection with the Model or Derivative Models, this Agreement will terminate as of the date such lawsuit is filed.
|
48 |
+
ii. Changes to the License
|
49 |
+
SB Intuitions may revise this Agreement at any time, for security and/or safety reasons, to adapt to new technologies, to comply with legal and regulatory requirements, or for any other reasons. Upon the revision, SB Intuitions will replace this Agreement on the posted sites.
|
50 |
+
iii. Breach of the License
|
51 |
+
SB Intuitions may terminate this Agreement if Licensee is in breach of any term or condition of this Agreement. Upon termination of this Agreement, Licensee shall delete and cease use of the Model and any Derivative Model.
|
52 |
+
iv. Separability
|
53 |
+
If any part or provision of this Agreement is held invalid, illegal or unenforceable, the validity, legality, and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
|
54 |
+
|
55 |
+
6. Prohibited activities
|
56 |
+
Licensee shall not engage in any of the prohibited activities set out in the "Sarashina Prohibited Activities Policy".
|
57 |
+
|
58 |
+
7. DISCLAIMER
|
59 |
+
i. FUNCTIONALITY AND PERFORMANCE
|
60 |
+
THE MODEL IS PROVIDED ON AN “AS IS” BASIS. SB INTUITIONS MAKES NO WARRANTY AS TO FUNCTIONALITY OR PERFORMANCE.
|
61 |
+
ii. SUITABILITY
|
62 |
+
SB INTUITIONS MAKES NO WARRANTY, EXPRESSED OR IMPLIED, WITH RESPECT TO THE MODEL OR DERIVATIVE MODELS, AS TO ANY MATTER INCLUDING BUT NOT LIMITED TO, NON-INFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY RIGHTS AND OTHER RIGHTS, MERCHANTABILITY, INTEGRITY, USABILITY, AND COMPATIBILITY FOR ANY PARTICULAR PURPOSE.
|
63 |
+
iii. APPROPRIATENESS OF USE OR REDISTRIBUTION
|
64 |
+
THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE MODEL OR DERIVATIVE MODELS SHALL BE DETERMINED BY THE LICENSEE WHO SHALL BEAR FULL RESPONSIBILITY FOR SUCH DETERMINATION AND ACTIONS.
|
65 |
+
iv. TECHNICAL SERVICES
|
66 |
+
SB INTUITIONS HAS NO OBLIGATION TO PROVIDE LICENSEE WITH ANY TECHNICAL SERVICES WITH RESPECT TO THE MODEL OR ANY DERIVATIVE MODELS.
|
67 |
+
|
68 |
+
8. LIMITATION OF LIABILITY
|
69 |
+
SB INTUITIONS (INCLUDING ITS AFFILIATES FOR THE PURPOSE OF THIS ARTICLE 8) SHALL NOT BE LIABLE UNDER ANY THEORY OF LIABILITY ARISING FROM THIS AGREEMENT, WHETHER IN AN ACTION BASED ON CONTRACT, TORT, NEGLIGENCE, PRODUCT LIABILITY OR OTHERWISE, EXCEPT IN CASES OF WILFUL MISCONDUCT OR GROSS NEGLIGENCE BY SB INTUITIONS, OR WHERE LIABILITY IS MANDATED BY LAW.
|
70 |
+
|
71 |
+
9. Compensation
|
72 |
+
Licensee shall indemnify and hold SB Intuitions (including its Affiliate for the purpose of this Article 9) harmless from and against all claims, demands, damages, losses, liabilities and expenses including, without limitation, reasonable attorneys' fees and expenses which SB Intuitions may suffer or incur as a result or as a consequence of, arising out of or in connection with the use, duplication, modification, distribution, execution or display of the Model, Derivative Models or Output Data by Licensee.
|
73 |
+
|
74 |
+
10. Trade
|
75 |
+
The Model, Derivative Models or the technology contained therein may be subject to export controls and sanctions laws and regulations of Japan and other jurisdictions (the "Export Controls"). Licensee shall comply with all applicable Export Controls. Licensee shall not access, use, download or send the Model or Derivative Models in any manner that would cause any party to violate any Export Controls.
|
76 |
+
|
77 |
+
11. Elimination of Anti-Social Forces
|
78 |
+
i. Licensee represents and warrants that it is not a gang, a gang member, an individual that left a gang within the last five (5) years, a quasi-member of a gang, a gang affiliate, corporate racketeer, a blackmailer camouflaged as a social movement activist, crime groups specialized in intellectual crimes or any other persons or groups equivalent to these (collectively "Anti-Social Forces") and that it does not fall under any of the following categories, presently and in the future:
|
79 |
+
(a) Licensee has relationships in which Anti-Social Forces are recognized to control the management;
|
80 |
+
(b) Licensee has relationships in which Anti-Social Forces are recognized to be substantially involved in the management;
|
81 |
+
(c) Licensee has relationships in which it is recognized to be engaged in unlawful use of Anti-Social Forces;
|
82 |
+
(d) Licensee provides funds or benefits to Anti-Social Forces; or
|
83 |
+
(e) Licensee’s officers or persons substantially involved in its management have socially reprehensible relationships with Anti-Social Forces.
|
84 |
+
ii. Licensee shall not engage in any of the following actions either by themselves or through a third party:
|
85 |
+
(a) violent demands;
|
86 |
+
(b) unreasonable demands beyond legal liability;
|
87 |
+
(c) threatening actions or use of violence in connection with transactions;
|
88 |
+
(d) spreading rumors, damaging the reputation of the counterparty by using fraudulent means or force, or obstructing the service of the other Party; or
|
89 |
+
(e) any other actions equivalent to any of the preceding items.
|
90 |
+
iii. SB Intuitions shall be entitled to unilaterally terminate this Agreement without giving any notice to Licensee when it is found that Licensee falls under the definition of Licensee or any of the items set forth in paragraph i of this Article 11, or has committed an action that falls under any of the items in paragraph ii of this Article 11, or has made a false declaration with regard to the representation or definitive agreement stated in paragraph i of this Article 11, regardless of any breach attributable to SB Intuitions. SB Intuitions shall not be responsible for any damage caused by the termination in accordance with this paragraph and incurred by Licensee.
|
91 |
+
|
92 |
+
12. Governing Law and Jurisdiction
|
93 |
+
i. Governing Law
|
94 |
+
This Agreement shall be governed by and construed in accordance with the laws of Japan.
|
95 |
+
ii. Jurisdiction
|
96 |
+
Any dispute arising from or in connection with this Agreement shall be subject to the exclusive jurisdiction of the Tokyo District Court in the first instance.
|
README.md
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- ja
|
4 |
+
license_name: sarahina-non-commercial-license
|
5 |
+
license_link: LICENSE
|
6 |
+
base_model:
|
7 |
+
- sbintuitions/sarashina2.2-1b
|
8 |
+
tags:
|
9 |
+
- transformers
|
10 |
+
- sentence-similarity
|
11 |
+
- feature-extraction
|
12 |
+
- sentence-transformers
|
13 |
+
inference: false
|
14 |
+
---
|
15 |
+
|
16 |
+
# Sarashina-Embedding-v2-1B
|
17 |
+
|
18 |
+
**[日本語のREADME/Japanese README](https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b/blob/main/README_JA.md)**
|
19 |
+
|
20 |
+
"Sarashina-Embedding-v2-1B" is a Japanese text embedding model, based on the Japanese LLM "[Sarashina2.2-1B](https://huggingface.co/sbintuitions/sarashina2.2-1b)".
|
21 |
+
We trained this model with multi-stage contrastive learning. We achieved the state-of-the-art average score across 28 datasets in [JMTEB](https://huggingface.co/datasets/sbintuitions/JMTEB) (Japanese Massive Text Embedding Benchmark). (Benchmarked on July 28, 2025.)
|
22 |
+
|
23 |
+
This model maps sentences & paragraphs to a 1792-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and other applications.
|
24 |
+
|
25 |
+
## Model Details
|
26 |
+
|
27 |
+
### Model Description
|
28 |
+
|
29 |
+
- **Model Type:** Sentence Transformer
|
30 |
+
- **Base model:** [Sarashina2.2-1B](https://huggingface.co/sbintuitions/sarashina2.2-1b)
|
31 |
+
- **Maximum Sequence Length:** 8,192 tokens
|
32 |
+
- **Output Dimensionality:** 1,792 dimensions
|
33 |
+
- **Similarity Function:** Cosine Similarity
|
34 |
+
- **Language:** Japanese
|
35 |
+
- **License:** [Sarashina Model NonCommercial License Agreement](https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b/blob/main/LICENSE)
|
36 |
+
|
37 |
+
### Full Model Architecture
|
38 |
+
|
39 |
+
```
|
40 |
+
SentenceTransformer(
|
41 |
+
(0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: LlamaModel
|
42 |
+
(1): Pooling({'word_embedding_dimension': 1792, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': True, 'include_prompt': False})
|
43 |
+
)
|
44 |
+
```
|
45 |
+
|
46 |
+
## Usage
|
47 |
+
|
48 |
+
First install the [Sentence Transformers](https://github.com/UKPLab/sentence-transformers) library:
|
49 |
+
|
50 |
+
```bash
|
51 |
+
pip install sentence-transformers==4.0.2
|
52 |
+
```
|
53 |
+
|
54 |
+
Then you can load this model and run inference.
|
55 |
+
|
56 |
+
```python
|
57 |
+
from sentence_transformers import SentenceTransformer
|
58 |
+
|
59 |
+
# Download from the 🤗 Hub
|
60 |
+
model = SentenceTransformer("sbintuitions/sarashina-embedding-v2-1b")
|
61 |
+
# Run inference
|
62 |
+
query = [
|
63 |
+
'task: クエリを与えるので、与えられたWeb検索クエリに答える関連文章を検索してください。\nquery: Sarashinaのテキスト埋め込みモデルはありますか?'
|
64 |
+
]
|
65 |
+
texts = [
|
66 |
+
'text: 更級日記は、平安時代中期に菅原孝標女によって書かれた回想録です。',
|
67 |
+
'text: Sarashinaは、SB Intuitionsが開発した日本語大規模言語モデルです。これまでに7B, 13B, 70B, 8x70Bのモデルが公開されています。',
|
68 |
+
'text: サラシナエンベディングは日本語言語モデルをベースにした日本語埋め込みモデルです。'
|
69 |
+
]
|
70 |
+
query_embedding = model.encode(query)
|
71 |
+
text_embeddings = model.encode(texts)
|
72 |
+
# Get the similarity scores between the embeddings
|
73 |
+
similarities = model.similarity(query_embedding, text_embeddings)
|
74 |
+
print(similarities)
|
75 |
+
# tensor([[0.7403, 0.8651, 0.8775]])
|
76 |
+
```
|
77 |
+
### How to add instructions and prefixes
|
78 |
+
|
79 |
+
For both the query and document sides, use different prefix formats. On the query side, add the prefix `task:` followed by instructions. (*Only for STS tasks: The document side should also use the same format of instruction and prefix as the query side.)
|
80 |
+
|
81 |
+
- Query Side: ```task: {Instruction}\nquery: {Query}```
|
82 |
+
- Document Side: ```text: {Document}```
|
83 |
+
|
84 |
+
### Templates for instructions and prefixes
|
85 |
+
|
86 |
+
The table below provides instruction and prefix templates for five main tasks.
|
87 |
+
|Task|Query Side|Document Side|
|
88 |
+
|:-:|:-|:-|
|
89 |
+
|Retrieval<br>Reranking|task: 質問を与えるので、その質問に答えるのに役立つ関連文書を検索してください。\nquery: |text: |
|
90 |
+
|Clustering|task: 与えられたドキュメントのトピックまたはテーマを特定してください。\nquery: |text: |
|
91 |
+
|Classification|task: 与えられたレビューを適切な評価カテゴリに分類してください。\nquery: |text: |
|
92 |
+
|STS|task: クエリを与えるので,もっともクエリに意味が似ている一節を探してください。\nquery: |task: クエリを与えるので,もっともクエリに意味が似ている一節を探してください。\nquery: |
|
93 |
+
|
94 |
+
## Training
|
95 |
+
|
96 |
+
Sarashina-Embedding-v2-1B is created through the following three-stage learning process:
|
97 |
+
|
98 |
+
### Stage 1: Weakly-supervised Learning
|
99 |
+
To build a general-purpose and high-performance embedding model for a wide range of domains, we employed contrastive learning using weak supervision data, which consists of our own web-crawled data and open datasets.
|
100 |
+
|
101 |
+
### Stage 2: Supervised Fine-tuning
|
102 |
+
To further train the model to better understand the similarity between queries and documents, we performed fine-tuning using higher-quality data than that used in Stage 1. Additionally, we trained multiple models by modifying parts of the data.
|
103 |
+
|
104 |
+
### Stage 3: Model Merging
|
105 |
+
To enhance performance, we merged the weights of the two models that yielded the highest JMTEB scores in Stage 2 through linear merging.
|
106 |
+
|
107 |
+
## Evaluation Results with [JMTEB](https://huggingface.co/datasets/sbintuitions/JMTEB)
|
108 |
+
|
109 |
+
|Model|Avg.|Retrieval|STS|Classification|Reranking|Clustering|
|
110 |
+
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|
111 |
+
|Sarashina-Embedding-v2-1B(This model)|76.38|76.48|84.22|77.14|86.28|52.56|
|
112 |
+
|[Ruri-v3-310m](https://huggingface.co/cl-nagoya/ruri-v3-310m)|75.85|76.03|81.59|77.65|85.84|50.52|
|
113 |
+
|[Sarashina-Embedding-v1-1B](https://huggingface.co/sbintuitions/sarashina-embedding-v1-1b)|74.87|74.53|81.71|77.20|84.36|50.30|
|
114 |
+
|text-embedding-3-large|73.86|71.95|82.52|77.27|83.06|51.82|
|
115 |
+
|
116 |
+
(*) Evaluated on July 28, 2025.
|
117 |
+
|
118 |
+
## License
|
119 |
+
|
120 |
+
This model is licensed under [Sarashina Model NonCommercial License Agreement](https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b/blob/main/LICENSE).
|
121 |
+
|
122 |
+
**If you are interested in using this model for commercial purposes, please feel free to contact us through our [contact page](https://www.sbintuitions.co.jp/#contact).**
|
README_JA.md
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- ja
|
4 |
+
license_name: sarahina-non-commercial-license
|
5 |
+
license_link: LICENSE
|
6 |
+
base_model:
|
7 |
+
- sbintuitions/sarashina2.2-1b
|
8 |
+
tags:
|
9 |
+
- transformers
|
10 |
+
- sentence-similarity
|
11 |
+
- feature-extraction
|
12 |
+
- sentence-transformers
|
13 |
+
inference: false
|
14 |
+
---
|
15 |
+
|
16 |
+
# Sarashina-Embedding-v2-1B
|
17 |
+
|
18 |
+
「Sarashina-Embedding-v2-1b」は、日本語LLM「[Sarashina2.2-1B](https://huggingface.co/sbintuitions/sarashina2.2-1b)」をベースにした日本語テキスト埋め込みモデルです。
|
19 |
+
|
20 |
+
このモデルは、マルチステージの対照学習で訓練され、 [JMTEB](https://huggingface.co/datasets/sbintuitions/JMTEB) (Japanese Massive Text Embedding Benchmark)の28個のデータセットの平均で、最高水準の平均スコア(2025/07/28時点)を達成しました。
|
21 |
+
|
22 |
+
このモデルは、文や文章を1792次元の高密度ベクトル空間にマッピングし、意味的テキスト類似度、意味的検索、paraphrase mining、テキスト分類、クラスタリングなどに使用できます。
|
23 |
+
|
24 |
+
## モデル詳細
|
25 |
+
|
26 |
+
### モデル説明
|
27 |
+
|
28 |
+
- **モデルタイプ:** Sentence Transformer
|
29 |
+
- **ベースモデル:** [Sarashina2.2-1B](https://huggingface.co/sbintuitions/sarashina2.2-1b)
|
30 |
+
- **最大シーケンス長:** 8,192トークン
|
31 |
+
- **出力次元数:** 1,792次元
|
32 |
+
- **類似度関数:** コサイン類似度
|
33 |
+
- **言語:** 日本語
|
34 |
+
- **ライセンス:** [Sarashina Model NonCommercial License Agreement](https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b/blob/main/LICENSE)
|
35 |
+
|
36 |
+
### モデルアーキテクチャ
|
37 |
+
|
38 |
+
```
|
39 |
+
SentenceTransformer(
|
40 |
+
(0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: LlamaModel
|
41 |
+
(1): Pooling({'word_embedding_dimension': 1792, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': True, 'include_prompt': False})
|
42 |
+
)
|
43 |
+
```
|
44 |
+
|
45 |
+
## 使用方法
|
46 |
+
|
47 |
+
まず、[Sentence Transformers](https://github.com/UKPLab/sentence-transformers)ライブラリをインストールします。
|
48 |
+
|
49 |
+
```bash
|
50 |
+
pip install sentence-transformers==4.0.2
|
51 |
+
```
|
52 |
+
|
53 |
+
次に、このモデルをロードし、推論を実行します。
|
54 |
+
|
55 |
+
```python
|
56 |
+
from sentence_transformers import SentenceTransformer
|
57 |
+
|
58 |
+
# 🤗 Hubからモデルをダウンロードする
|
59 |
+
from sentence_transformers import SentenceTransformer
|
60 |
+
|
61 |
+
model = SentenceTransformer("sbintuitions/sarashina-embedding-v2-1b")
|
62 |
+
# 推論を実行する
|
63 |
+
query = [
|
64 |
+
'task: クエリを与えるので、与えられたWeb検索クエリに答える関連文章を検索してください。\nquery: Sarashinaのテキスト埋め込みモデルはありますか?'
|
65 |
+
]
|
66 |
+
texts = [
|
67 |
+
'text: 更級日記は、平安時代中期に菅原孝標女によって書かれた回想録です。',
|
68 |
+
'text: Sarashinaは、SB Intuitionsが開発した日本語大規模言語モデルです。これまでに7B, 13B, 70B, 8x70Bのモデルが公開されています。',
|
69 |
+
'text: サラシナエンベディングは日本語言語モデルをベースにした日本語埋め込みモデルです。'
|
70 |
+
]
|
71 |
+
query_embedding = model.encode(query)
|
72 |
+
text_embeddings = model.encode(texts)
|
73 |
+
|
74 |
+
# 埋め込みの類似度スコアを取得する
|
75 |
+
similarities = model.similarity(query_embedding, text_embeddings)
|
76 |
+
print(similarities)
|
77 |
+
# tensor([[0.7403, 0.8651, 0.8775]])
|
78 |
+
```
|
79 |
+
|
80 |
+
### instruction,prefixの付け方
|
81 |
+
|
82 |
+
クエリ側とドキュメント側で形式の異なるprefixをつけ、クエリ側には```task:```というprefixの後に指示を付与します。(*ただしSTSタスクのみドキュメント側にもクエリ側と同じ形式のinstructionとprefixを使用します)
|
83 |
+
- クエリ側: ```task: {インストラクション}\nquery: {クエリ}```
|
84 |
+
- ドキュメント側: ```text: {ドキュメント}```
|
85 |
+
|
86 |
+
|
87 |
+
### instruction,prefixのテンプレート
|
88 |
+
|
89 |
+
テキスト埋め込みモデルが用いられる5つの主なタスクで使用できるinstructionとprefixのテンプレートを下記の表に示します。
|
90 |
+
|
91 |
+
|タスク|クエリ側|ドキュメント側|
|
92 |
+
|:-:|:-|:-|
|
93 |
+
|Retrieval<br>Reranking|task: 質問を与えるので、その質問に答えるのに役立つ関連文書を検索してください。\nquery: |text: |
|
94 |
+
|Clustering|task: 与えられたドキュメントのトピックまたはテーマを特定してください。\nquery: |text: |
|
95 |
+
|Classification|task: 与えられたレビューを適切な評価カテゴリに分類してください。\nquery: |text: |
|
96 |
+
|STS|task: クエリを与えるので,もっともクエリに意味が似ている一節を探してください。\nquery: |task: クエリを与えるので,もっともクエリに意味が似ている一節を探してください。\nquery: |
|
97 |
+
|
98 |
+
## 学習
|
99 |
+
|
100 |
+
Sarashina-Embedding-v2-1Bは、以下の3段階の学習ステージによって学習されています。
|
101 |
+
|
102 |
+
### Stage 1: 弱教師あり学習
|
103 |
+
|
104 |
+
幅広いドメインに対して汎用的かつ高い性能を持つ埋め込みモデルを構築するため、独自のwebクロールデータとオープンデータで構成された弱教師データによる対照学習を行いました。
|
105 |
+
|
106 |
+
### Stage 2: ファインチューニング
|
107 |
+
|
108 |
+
より正確なクエリ-ドキュメント間の類似度をモデルに学習させるために、Stage 1で使用したデータよりも高品質なデータを用いてファインチューニングを行いました。
|
109 |
+
またデータの一部を変更して複数のモデルを学習しました。
|
110 |
+
|
111 |
+
### Stage 3: モデルマージ
|
112 |
+
Stage 2においてJMTEBのスコアが最も高かった2つのモデルの重みを線形マージすることで性能向上を図りました。
|
113 |
+
|
114 |
+
## [JMTEB](https://huggingface.co/datasets/sbintuitions/JMTEB)による評価
|
115 |
+
|
116 |
+
|Model|Avg.|Retrieval|STS|Classification|Reranking|Clustering|
|
117 |
+
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|
118 |
+
|Sarashina-Embedding-v2-1B(This model)|76.38|76.48|84.22|77.14|86.28|52.56|
|
119 |
+
|[Ruri-v3-310m](https://huggingface.co/cl-nagoya/ruri-v3-310m)|75.85|76.03|81.59|77.65|85.84|50.52|
|
120 |
+
|[Sarashina-Embedding-v1-1B](https://huggingface.co/sbintuitions/sarashina-embedding-v1-1b)|74.87|74.53|81.71|77.20|84.36|50.30|
|
121 |
+
|text-embedding-3-large|73.86|71.95|82.52|77.27|83.06|51.82|
|
122 |
+
|
123 |
+
(*) 評価日:2025.07.28
|
124 |
+
|
125 |
+
## ライセンス
|
126 |
+
|
127 |
+
このモデルは[Sarashina Model NonCommercial License Agreement](https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b/blob/main/LICENSE)に基づいて公開されています。
|
128 |
+
|
129 |
+
**もしこのモデルの商用利用にご興味がある場合は、お気軽に[コンタクトページ](https://www.sbintuitions.co.jp/#contact)へご連絡ください。**
|
config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"LlamaModel"
|
4 |
+
],
|
5 |
+
"attention_bias": false,
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"bos_token_id": 1,
|
8 |
+
"eos_token_id": 2,
|
9 |
+
"head_dim": 112,
|
10 |
+
"hidden_act": "silu",
|
11 |
+
"hidden_size": 1792,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 6272,
|
14 |
+
"max_position_embeddings": 8192,
|
15 |
+
"mlp_bias": false,
|
16 |
+
"model_type": "llama",
|
17 |
+
"num_attention_heads": 16,
|
18 |
+
"num_hidden_layers": 24,
|
19 |
+
"num_key_value_heads": 8,
|
20 |
+
"pretraining_tp": 1,
|
21 |
+
"rms_norm_eps": 1e-05,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rope_theta": 500000,
|
24 |
+
"tie_word_embeddings": false,
|
25 |
+
"torch_dtype": "float32",
|
26 |
+
"transformers_version": "4.50.3",
|
27 |
+
"use_cache": true,
|
28 |
+
"vocab_size": 102400
|
29 |
+
}
|
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "4.0.2",
|
4 |
+
"transformers": "4.50.3",
|
5 |
+
"pytorch": "2.6.0+cu126"
|
6 |
+
},
|
7 |
+
"prompts": {},
|
8 |
+
"default_prompt_name": null,
|
9 |
+
"similarity_fn_name": "cosine"
|
10 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be5307e805b214153dd0005aef4b1df97ecca74b2d85e3da67914fa127a7de0a
|
3 |
+
size 4896176272
|
modules.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
}
|
14 |
+
]
|
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 512,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"cls_token": {
|
10 |
+
"content": "<cls>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"eos_token": {
|
17 |
+
"content": "</s>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"mask_token": {
|
24 |
+
"content": "<mask>",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"pad_token": {
|
31 |
+
"content": "<pad>",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
},
|
37 |
+
"sep_token": {
|
38 |
+
"content": "<sep>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false
|
43 |
+
},
|
44 |
+
"unk_token": {
|
45 |
+
"content": "<unk>",
|
46 |
+
"lstrip": false,
|
47 |
+
"normalized": false,
|
48 |
+
"rstrip": false,
|
49 |
+
"single_word": false
|
50 |
+
}
|
51 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:008293028e1a9d9a1038d9b63d989a2319797dfeaa03f171093a57b33a3a8277
|
3 |
+
size 1831879
|
tokenizer_config.json
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": false,
|
3 |
+
"add_dummy_prefix_space": false,
|
4 |
+
"add_eos_token": true,
|
5 |
+
"add_prefix_space": false,
|
6 |
+
"added_tokens_decoder": {
|
7 |
+
"0": {
|
8 |
+
"content": "<unk>",
|
9 |
+
"lstrip": false,
|
10 |
+
"normalized": false,
|
11 |
+
"rstrip": false,
|
12 |
+
"single_word": false,
|
13 |
+
"special": true
|
14 |
+
},
|
15 |
+
"1": {
|
16 |
+
"content": "<s>",
|
17 |
+
"lstrip": false,
|
18 |
+
"normalized": false,
|
19 |
+
"rstrip": false,
|
20 |
+
"single_word": false,
|
21 |
+
"special": true
|
22 |
+
},
|
23 |
+
"2": {
|
24 |
+
"content": "</s>",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false,
|
29 |
+
"special": true
|
30 |
+
},
|
31 |
+
"3": {
|
32 |
+
"content": "<pad>",
|
33 |
+
"lstrip": false,
|
34 |
+
"normalized": false,
|
35 |
+
"rstrip": false,
|
36 |
+
"single_word": false,
|
37 |
+
"special": true
|
38 |
+
},
|
39 |
+
"4": {
|
40 |
+
"content": "<sep>",
|
41 |
+
"lstrip": false,
|
42 |
+
"normalized": false,
|
43 |
+
"rstrip": false,
|
44 |
+
"single_word": false,
|
45 |
+
"special": true
|
46 |
+
},
|
47 |
+
"5": {
|
48 |
+
"content": "<mask>",
|
49 |
+
"lstrip": false,
|
50 |
+
"normalized": false,
|
51 |
+
"rstrip": false,
|
52 |
+
"single_word": false,
|
53 |
+
"special": true
|
54 |
+
},
|
55 |
+
"6": {
|
56 |
+
"content": "<cls>",
|
57 |
+
"lstrip": false,
|
58 |
+
"normalized": false,
|
59 |
+
"rstrip": false,
|
60 |
+
"single_word": false,
|
61 |
+
"special": true
|
62 |
+
},
|
63 |
+
"7": {
|
64 |
+
"content": "<|system|>",
|
65 |
+
"lstrip": false,
|
66 |
+
"normalized": false,
|
67 |
+
"rstrip": false,
|
68 |
+
"single_word": false,
|
69 |
+
"special": false
|
70 |
+
},
|
71 |
+
"8": {
|
72 |
+
"content": "<|assistant|>",
|
73 |
+
"lstrip": false,
|
74 |
+
"normalized": false,
|
75 |
+
"rstrip": false,
|
76 |
+
"single_word": false,
|
77 |
+
"special": false
|
78 |
+
},
|
79 |
+
"9": {
|
80 |
+
"content": "<|user|>",
|
81 |
+
"lstrip": false,
|
82 |
+
"normalized": false,
|
83 |
+
"rstrip": false,
|
84 |
+
"single_word": false,
|
85 |
+
"special": false
|
86 |
+
},
|
87 |
+
"10": {
|
88 |
+
"content": "<|available_tools|>",
|
89 |
+
"lstrip": false,
|
90 |
+
"normalized": false,
|
91 |
+
"rstrip": false,
|
92 |
+
"single_word": false,
|
93 |
+
"special": false
|
94 |
+
},
|
95 |
+
"11": {
|
96 |
+
"content": "<|tool_calls|>",
|
97 |
+
"lstrip": false,
|
98 |
+
"normalized": false,
|
99 |
+
"rstrip": false,
|
100 |
+
"single_word": false,
|
101 |
+
"special": false
|
102 |
+
},
|
103 |
+
"12": {
|
104 |
+
"content": "<|tool_results|>",
|
105 |
+
"lstrip": false,
|
106 |
+
"normalized": false,
|
107 |
+
"rstrip": false,
|
108 |
+
"single_word": false,
|
109 |
+
"special": false
|
110 |
+
},
|
111 |
+
"13": {
|
112 |
+
"content": "<|code|>",
|
113 |
+
"lstrip": false,
|
114 |
+
"normalized": false,
|
115 |
+
"rstrip": false,
|
116 |
+
"single_word": false,
|
117 |
+
"special": false
|
118 |
+
},
|
119 |
+
"14": {
|
120 |
+
"content": "<|file|>",
|
121 |
+
"lstrip": false,
|
122 |
+
"normalized": false,
|
123 |
+
"rstrip": false,
|
124 |
+
"single_word": false,
|
125 |
+
"special": false
|
126 |
+
},
|
127 |
+
"102397": {
|
128 |
+
"content": "<|prefix|>",
|
129 |
+
"lstrip": false,
|
130 |
+
"normalized": false,
|
131 |
+
"rstrip": false,
|
132 |
+
"single_word": false,
|
133 |
+
"special": false
|
134 |
+
},
|
135 |
+
"102398": {
|
136 |
+
"content": "<|suffix|>",
|
137 |
+
"lstrip": false,
|
138 |
+
"normalized": false,
|
139 |
+
"rstrip": false,
|
140 |
+
"single_word": false,
|
141 |
+
"special": false
|
142 |
+
},
|
143 |
+
"102399": {
|
144 |
+
"content": "<|middle|>",
|
145 |
+
"lstrip": false,
|
146 |
+
"normalized": false,
|
147 |
+
"rstrip": false,
|
148 |
+
"single_word": false,
|
149 |
+
"special": false
|
150 |
+
}
|
151 |
+
},
|
152 |
+
"bos_token": "<s>",
|
153 |
+
"clean_up_tokenization_spaces": false,
|
154 |
+
"cls_token": "<cls>",
|
155 |
+
"do_lower_case": false,
|
156 |
+
"eos_token": "</s>",
|
157 |
+
"extra_ids": 0,
|
158 |
+
"extra_special_tokens": {},
|
159 |
+
"keep_accents": true,
|
160 |
+
"legacy": false,
|
161 |
+
"mask_token": "<mask>",
|
162 |
+
"model_max_length": 512,
|
163 |
+
"pad_token": "<pad>",
|
164 |
+
"padding_side": "left",
|
165 |
+
"sep_token": "<sep>",
|
166 |
+
"sp_model_kwargs": {},
|
167 |
+
"spaces_between_special_tokens": false,
|
168 |
+
"tokenizer_class": "LlamaTokenizer",
|
169 |
+
"unk_token": "<unk>",
|
170 |
+
"use_default_system_prompt": false
|
171 |
+
}
|