Upload folder using huggingface_hub
- .gitattributes +1 -0
- 1_Pooling/config.json +10 -0
- README.md +448 -0
- config.json +36 -0
- config_sentence_transformers.json +10 -0
- model.safetensors +3 -0
- modules.json +14 -0
- sentence_bert_config.json +4 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +51 -0
- tokenizer.json +3 -0
- tokenizer_config.json +63 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
{
  "word_embedding_dimension": 384,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false,
  "include_prompt": true
}
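
With `pooling_mode_mean_tokens` set to `true`, this Pooling module averages the transformer's token embeddings into one 384-dimensional sentence vector, using the attention mask so padding tokens are ignored. A minimal sketch of that computation in plain PyTorch (tensor names are illustrative):

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average the token embeddings over non-padding positions.

    token_embeddings: (batch, seq_len, 384) last hidden states
    attention_mask:   (batch, seq_len), 1 for real tokens, 0 for padding
    """
    mask = attention_mask.unsqueeze(-1).float()     # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)   # (batch, 384)
    counts = mask.sum(dim=1).clamp(min=1e-9)        # guard against all-padding rows
    return summed / counts                          # (batch, 384) sentence embeddings
```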
README.md
ADDED
@@ -0,0 +1,448 @@
---
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:25743
- loss:MultipleNegativesRankingLoss
base_model: Lajavaness/bilingual-embedding-small
widget:
- source_sentence: Luciano Hang da HAVAN, send 200 oxygen cylinders to Manaus in their
    planes. attitude of man of true, bravo! O
  sentences:
  - The photo of Pedro Sánchez «enjoying while Gran Canaria burns»
  - Havan owner Luciano Hang donated 200 oxygen cylinders to Manaus
  - Video of the show in Shanghai staged by robots made in China
- source_sentence: '"PERSEVERANCE" SEND THE FIRST COLOR IMAGES FROM THE SURFACE OF
    MARS'
  sentences:
  - If an election has 51% of the votes cast, the election is annulled.
  - This video shows Indian Air Force attack helicopters flying over Pangong Lake
    in Ladakh.
  - The first video images of Mars from the Perseverance rover
- source_sentence: SPEECH BY PEDRO CASTILLO, IT WAS BASED ON THE HATE OF SPAIN OF
    A PAST PRE-HISPANIC THAT I ONLY KNOW EXPLAINS FROM THE MOST ABSOLUTE IGNORANCE
    AND STUPIDITY" KING OF SPAINIn fact, between the president of Colombia, Duque,
    and the king of Spain, the most regretful of having come to the inauguration of
    the clown with a hat is the latter.
  sentences:
  - Felipe VI said that Pedro Castillo's speech is explained from ignorance and stupidity
  - '"Population poorly tolerated quarantine and social distancing measures during
    the Spanish flu, when the first deconfinement took place, abandoning all precautionary
    measures"'
  - Genuine photo of Philippine lawmaker Sarah Elago supporting mandatory military
    training in schools
- source_sentence: Australia Day has nothing to do with Captain Cook or Botany Bay
    The Landing of Captain Cook at the site of Sydney happened on the 28th of April
    1770 - NOT on the 26th of January 1770. The first fleet arrived in Australia on
    18 January 1788 and landed at Botany Bay on 20 January 1788. AUSTRALIA DAY CELEBRATES
    THE DAY ALL AUSTRALIANS STOPPED BEING BRITISH CITIZENS AND BECAME AUSTRALIAN CITIZENS
    IN 1949. Facts about Australia Day Our Education system and the popular press
    is not competently advising our children !! Twisting the truth a bit. Don't expect
    the media to educate you, that's not part of their agenda. Australia Day does
    not celebrate the arrival of the first fleet or the invasion of anything. The
    First Fleet arrived in Botany Bay on the 18th of January. However, Captain Cook's
    landing was included in Australia Day celebrations as a reminder of a significant
    historical event. Since the extravagant bicentenary celebrations of 1988, when
    Sydney-siders decided Captain Cook's landing should become the focus of the Australia
    Day commemoration, the importance of this date for all Australians has begun to
    fade. Now, a generation later, it's all but lost. This is because our politicians
    and educators have not been doing a good job promoting the day. Our politicians
    have not been advertising the real reason for Australia Day, and our educators
    have not been teaching our children the importance of the 26th of January to all
    Australians. The media, as usual, is happy to twist the truth for the sake of
    controversy. In recent years, the media has helped fan the flames of discontent
    among the Aboriginal community. Many are now so offended by what they see as a
    celebration of the beginning of the darkest days of Aboriginal history, they want
    the date changed. Various local Councils are seeking to remove themselves from
    Australia Day celebrations, even refusing to participate in citizenship ceremonies,
    and calls are going out to have Australia Day on a different day. The big question
    is, why has the Government allowed this misconception to continue? Captain Cook
    didn't land on the 26th of January. So changing the date of any celebration of
    Captain Cook's landing would not have any impact on Australia Day, but maybe it
    would clear the way for the truth about Australia Day. The reality is, the Aborigines
    in this country suffered under the hands of British colonialism. This is as much
    Australia's history as the landing of the first fleet, and both should be remembered,
    equally. Both should be taught, side by side, in our schools. Australians of today
    reject what was done under British governance to the Aborigines. We reject what
    was done under British governance to the Irish and many other cultures around
    the world. So, after the horrors of WWII, we decided to fix it. We became our
    own people. On the 26th of January 1949, the Australian nationality came into
    existence when the Nationality and Citizenship Act 1948 was enacted. That was
    the day we were first called Australians and allowed to travel with Passports
    as Australians. Under the Nationality Act 1920 (Cth), all Aborigines and Torres
    Strait Islanders born after January 1, 1921, gained the status of British subjects.
    In 1949, therefore, they automatically became Australian citizens under the Nationality
    and Citizenship Act 1948. Before that special date, all people living in Australia,
    including Aborigines born after 1921, were called 'British Subjects' and forced
    to travel on British Passports and fight in British wars. We all became Australians
    on the same day! This is why we celebrate Australia Day on the 26th of January!
    This was the day Australians became free to make our own decisions about which
    wars we would fight and how our citizens would be treated. It was the day Aborigines
    were declared Australians. Until this date, Aborigines were not protected by law.
    For the first time since Cook's landing, this new Act gave Aboriginal Australians
    by inference and precedent the full protection of Australian law. Because of this
    Act, the government became free to help Aborigines, and since that day much has
    been done to assist Aboriginal Australians, including saying 'sorry' for the previous
    atrocities done before this law came into being. This was a great day for all
    Australians! This is why the 26th of January is the day new Australians receive
    their citizenship. It is a day which celebrates the implementation of the Nationality
    and Citizenship Act of 1948 - the Act which gave freedom and protection to the
    first Australians and gives all Australians, old and new, the right to live under
    the protection of Australian Law, united as one nation. Now, isn't that cause
    for celebration? Education is key! There is a great need for education on the
    real reason we celebrate Australia Day on the 26th of January. This reason needs
    to be advertised and taught in schools. We all need to remember this one very
    special day in Australia's history, when freedom came to all Australians. What
    was achieved that day is something for which all Australians can be proud! We
    need to remember both the good and the bad in our history, but the emphasis must
    be the freedom and unity all Australians now have, because of what was done on
    the 26th of January 1949, to allow all of us to live without fear in a land of
    peace. Isn't it time all Australians were taught the real reason we celebrate
    Australia Day on Jan 26th?
  sentences:
  - Australia Day is commemorated on the day when Australian citizenship law passed
  - Sri Lankan Defense Secretary Kamal Gunarathne praised ex-minister Rishad Bathiudeen
    in speech
  - The corona virus is possibly caused due to the use of rhino horn
- source_sentence: This is how the buildings moved this morning in Mexico City
  sentences:
  - The video captures the earthquake of June 23, 2020
  - Photo shows bombing in Gaza in January 2022
  - Photo shows former South African president Jacob Zuma in prison
pipeline_tag: sentence-similarity
library_name: sentence-transformers
---

# SentenceTransformer based on Lajavaness/bilingual-embedding-small

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [Lajavaness/bilingual-embedding-small](https://huggingface.co/Lajavaness/bilingual-embedding-small). It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [Lajavaness/bilingual-embedding-small](https://huggingface.co/Lajavaness/bilingual-embedding-small) <!-- at revision ed4a1dd814de0db81d4a4e287c296a03194463e3 -->
- **Maximum Sequence Length:** 512 tokens
- **Output Dimensionality:** 384 dimensions
- **Similarity Function:** Cosine Similarity
<!-- - **Training Dataset:** Unknown -->
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BilingualModel
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
```

## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("sentence_transformers_model_id")
# Run inference
sentences = [
    'This is how the buildings moved this morning in Mexico City',
    'The video captures the earthquake of June 23, 2020',
    'Photo shows bombing in Gaza in January 2022',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```
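
The `Direct Usage (Transformers)` section below is left as a commented-out placeholder. For reference, here is a minimal sketch of equivalent inference with plain 🤗 Transformers; `trust_remote_code=True` is needed because the base model ships a custom `BilingualModel` class (see the `auto_map` in `config.json`), and the mean pooling mirrors `1_Pooling/config.json`:

```python
import torch
from transformers import AutoTokenizer, AutoModel

model_id = "sentence_transformers_model_id"  # placeholder, as in the snippet above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

sentences = ["This is how the buildings moved this morning in Mexico City"]
batch = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    token_embeddings = model(**batch).last_hidden_state  # (batch, seq_len, 384)

# Mean pooling over non-padding tokens, matching the saved Pooling config
mask = batch["attention_mask"].unsqueeze(-1).float()
embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
```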

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### Unnamed Dataset

* Size: 25,743 training samples
* Columns: <code>sentence_0</code>, <code>sentence_1</code>, and <code>label</code>
* Approximate statistics based on the first 1000 samples:
  |         | sentence_0 | sentence_1 | label |
  |:--------|:-----------|:-----------|:------|
  | type    | string     | string     | float |
  | details | <ul><li>min: 2 tokens</li><li>mean: 110.17 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 19.45 tokens</li><li>max: 190 tokens</li></ul> | <ul><li>min: 1.0</li><li>mean: 1.0</li><li>max: 1.0</li></ul> |
* Samples:
  | sentence_0 | sentence_1 | label |
  |:-----------|:-----------|:------|
  | <code>best music k.m KOSE CELLIE HINS GUINOT SKIN CARE KWhat people fear most is not being physically disabled, but giving up on themselves. There are still many beautiful things in life to aspire to! This stunning performance, known as the American spirit, brought tears to the eyes of 10,000 spectators. Male dancer Babo has been blind since childhood due to a fire in his home. In order to protect him, his mother held him tightly in her arms and jumped from the 7th floor. The mother died as a result, and the little baby became blind due to bleeding from the fundus. His mother was an ice skater before he died, and Babo also had a soft spot for ice skating. Although he couldn't see anything, he still pursued dance enthusiastically. He danced the famous tango "La Cumparsita" with his partner at the World Figure Skating Championships in Helsinki! 1. His ears are like bats that can measure the sound and distance around him. 2. The female dancer is very amazing. She danced with him and led him for...</code> | <code>Performance by a blind American ice dancer</code> | <code>1.0</code> |
  | <code>Photo from 2016. "Good" times when health was "fine" and the press did not report anything about. Bunch of Hypocrites...Let's go fight my people... . left right not army above all</code> | <code>Photo of a hospital in 2016. Good times when health was "good" and the press didn't report anything about it</code> | <code>1.0</code> |
  | <code>Haifa Oh Tel Aviv-Yafo Oh N WEST BANK Jerusalem is GAZA STRIPE Be'er Sheva Israel 65 65 35 35 15 M5 10 40Google and Apple maps have officially removed Palestine from the World Maps. Today Palestine was erased from the maps tomorrow Palestine will be erased from the world. PUT PALESTINE BACK ON THE MAP. Please unite now Pakistanio. Enemy is very strong if we are divided. Think just about Pakistan. Support each other, support Pakistan and support your leadership.</code> | <code>Google and Apple removed Palestine from its maps</code> | <code>1.0</code> |
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
  ```json
  {
      "scale": 20.0,
      "similarity_fct": "cos_sim"
  }
  ```
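
These parameters match the default construction of this loss in Sentence Transformers: each `(sentence_0, sentence_1)` pair is treated as (anchor, positive), and the other `sentence_1` entries in the batch serve as in-batch negatives, so only positive pairs are required (consistent with every `label` above being 1.0). A sketch of the equivalent setup (variable names are illustrative):

```python
from sentence_transformers import SentenceTransformer, losses, util

model = SentenceTransformer("Lajavaness/bilingual-embedding-small", trust_remote_code=True)
loss = losses.MultipleNegativesRankingLoss(model, scale=20.0, similarity_fct=util.cos_sim)
```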

### Training Hyperparameters
#### Non-Default Hyperparameters

- `num_train_epochs`: 1
- `multi_dataset_batch_sampler`: round_robin

#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: no
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 8
- `per_device_eval_batch_size`: 8
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 5e-05
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1
- `num_train_epochs`: 1
- `max_steps`: -1
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: False
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: False
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: None
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `include_for_metrics`: []
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`: 
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `dispatch_batches`: None
- `split_batches`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `use_liger_kernel`: False
- `eval_use_gather_object`: False
- `average_tokens_across_devices`: False
- `prompts`: None
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: round_robin

</details>
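
For reference, a run with these non-default hyperparameters would look roughly like the sketch below under Sentence Transformers 3.x (the dataset and output path are illustrative stand-ins, not the actual training data):

```python
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses,
)

model = SentenceTransformer("Lajavaness/bilingual-embedding-small", trust_remote_code=True)

# Illustrative stand-in for the 25,743 (sentence_0, sentence_1) training pairs
train_dataset = Dataset.from_dict({
    "sentence_0": ["This is how the buildings moved this morning in Mexico City"],
    "sentence_1": ["The video captures the earthquake of June 23, 2020"],
})

args = SentenceTransformerTrainingArguments(
    output_dir="output",  # illustrative path
    num_train_epochs=1,
    multi_dataset_batch_sampler="round_robin",
)
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=losses.MultipleNegativesRankingLoss(model),
)
trainer.train()
```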

### Training Logs
| Epoch  | Step | Training Loss |
|:------:|:----:|:-------------:|
| 0.1554 | 500  | 0.1021        |
| 0.3108 | 1000 | 0.0732        |
| 0.4661 | 1500 | 0.0781        |
| 0.6215 | 2000 | 0.0762        |
| 0.7769 | 2500 | 0.0763        |
| 0.9323 | 3000 | 0.0739        |
| 0.1554 | 500  | 0.0474        |
| 0.3108 | 1000 | 0.0478        |
| 0.4661 | 1500 | 0.0558        |
| 0.6215 | 2000 | 0.0542        |
| 0.7769 | 2500 | 0.0457        |
| 0.9323 | 3000 | 0.0395        |
| 0.1554 | 500  | 0.0055        |
| 0.3108 | 1000 | 0.0046        |
| 0.4661 | 1500 | 0.0099        |
| 0.6215 | 2000 | 0.0157        |
| 0.7769 | 2500 | 0.0176        |
| 0.9323 | 3000 | 0.0257        |


### Framework Versions
- Python: 3.11.11
- Sentence Transformers: 3.4.1
- Transformers: 4.48.3
- PyTorch: 2.5.1+cu124
- Accelerate: 1.3.0
- Datasets: 3.3.1
- Tokenizers: 0.21.0

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

#### MultipleNegativesRankingLoss
```bibtex
@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply},
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
config.json
ADDED
@@ -0,0 +1,36 @@
{
  "_name_or_path": "Lajavaness/bilingual-embedding-small",
  "architectures": [
    "BilingualModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "Lajavaness/bilingual-embedding-small--config.BilingualConfig",
    "AutoModel": "Lajavaness/bilingual-embedding-small--modeling.BilingualModel",
    "AutoModelForMaskedLM": "dangvantuan/bilingual_impl--modeling.BilingualForMaskedLM",
    "AutoModelForMultipleChoice": "dangvantuan/bilingual_impl--modeling.BilingualForMultipleChoice",
    "AutoModelForQuestionAnswering": "dangvantuan/bilingual_impl--modeling.BilingualForQuestionAnswering",
    "AutoModelForSequenceClassification": "dangvantuan/bilingual_impl--modeling.BilingualForSequenceClassification",
    "AutoModelForTokenClassification": "dangvantuan/bilingual_impl--modeling.BilingualForTokenClassification"
  },
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bilingual",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.48.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 250037
}
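
The `auto_map` entries above use the Hub's `<repo_id>--<module>.<ClassName>` convention for remote code, so the custom `BilingualModel` implementation is fetched from the referenced repositories at load time. This is why loading the checkpoint with vanilla 🤗 Transformers requires `trust_remote_code=True`; a sketch:

```python
from transformers import AutoModel

# auto_map resolves "Lajavaness/bilingual-embedding-small--modeling.BilingualModel"
model = AutoModel.from_pretrained("Lajavaness/bilingual-embedding-small", trust_remote_code=True)
```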
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
{
  "__version__": {
    "sentence_transformers": "3.4.1",
    "transformers": "4.48.3",
    "pytorch": "2.5.1+cu124"
  },
  "prompts": {},
  "default_prompt_name": null,
  "similarity_fn_name": "cosine"
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc604aba72b29a92f00004e7d148167b6a83edd19bcb2bce9900fa5957bb2387
size 470637416
modules.json
ADDED
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
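
This file is what tells Sentence Transformers to chain the `Transformer` module (saved at the repo root) with the `Pooling` module (saved under `1_Pooling/`). A sketch of the equivalent manual assembly (illustrative; loading the saved folder with `SentenceTransformer(...)` does this automatically):

```python
from sentence_transformers import SentenceTransformer, models

transformer = models.Transformer(
    "Lajavaness/bilingual-embedding-small",
    max_seq_length=512,
    model_args={"trust_remote_code": True},
)
pooling = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[transformer, pooling])
```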
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 512,
  "do_lower_case": false
}
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef04f2b385d1514f500e779207ace0f53e30895ce37563179e29f4022d28ca38
size 17083053
tokenizer_config.json
ADDED
@@ -0,0 +1,63 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "250001": {
      "content": "<mask>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "max_length": 512,
  "model_max_length": 512,
  "pad_to_multiple_of": null,
  "pad_token": "<pad>",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "</s>",
  "sp_model_kwargs": {},
  "stride": 0,
  "tokenizer_class": "XLMRobertaTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "<unk>"
}