---
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - generated_from_trainer
  - dataset_size:234000
  - loss:MSELoss
base_model: google-bert/bert-base-multilingual-uncased
widget:
  - source_sentence: who sings in spite of ourselves with john prine
    sentences:
      - es
      - når ble michael jordan draftet til nba
      - quien canta en spite of ourselves con john prine
  - source_sentence: who wrote when you look me in the eyes
    sentences:
      - متى بدأت الفتاة الكشفية في بيع ملفات تعريف الارتباط
      - A écrit when you look me in the eyes
      - fr
  - source_sentence: when was fathers day made a national holiday
    sentences:
      - wann wurde der Vatertag zum nationalen Feiertag
      - de
      - ' អ្នកណាច្រៀង i want to sing you a love song'
  - source_sentence: what is the density of the continental crust
    sentences:
      - cuál es la densidad de la corteza continental
      - wie zingt i want to sing you a love song
      - es
  - source_sentence: who wrote the song i shot the sheriff
    sentences:
      - Quel est l'âge légal pour consommer du vin au Canada?
      - i shot the sheriff şarkısını kim besteledi
      - tr
pipeline_tag: sentence-similarity
library_name: sentence-transformers
metrics:
  - negative_mse
model-index:
  - name: SentenceTransformer based on google-bert/bert-base-multilingual-uncased
    results:
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ar
          type: MSE-val-en-to-ar
        metrics:
          - type: negative_mse
            value: -20.37721574306488
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to da
          type: MSE-val-en-to-da
        metrics:
          - type: negative_mse
            value: -17.167489230632782
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to de
          type: MSE-val-en-to-de
        metrics:
          - type: negative_mse
            value: -17.10948944091797
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to en
          type: MSE-val-en-to-en
        metrics:
          - type: negative_mse
            value: -15.333698689937592
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to es
          type: MSE-val-en-to-es
        metrics:
          - type: negative_mse
            value: -16.898061335086823
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to fi
          type: MSE-val-en-to-fi
        metrics:
          - type: negative_mse
            value: -18.428558111190796
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to fr
          type: MSE-val-en-to-fr
        metrics:
          - type: negative_mse
            value: -17.04207956790924
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to he
          type: MSE-val-en-to-he
        metrics:
          - type: negative_mse
            value: -19.942057132720947
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to hu
          type: MSE-val-en-to-hu
        metrics:
          - type: negative_mse
            value: -18.757066130638123
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to it
          type: MSE-val-en-to-it
        metrics:
          - type: negative_mse
            value: -17.18708872795105
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ja
          type: MSE-val-en-to-ja
        metrics:
          - type: negative_mse
            value: -19.915536046028137
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ko
          type: MSE-val-en-to-ko
        metrics:
          - type: negative_mse
            value: -21.39919400215149
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to km
          type: MSE-val-en-to-km
        metrics:
          - type: negative_mse
            value: -28.658682107925415
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ms
          type: MSE-val-en-to-ms
        metrics:
          - type: negative_mse
            value: -17.25209951400757
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to nl
          type: MSE-val-en-to-nl
        metrics:
          - type: negative_mse
            value: -16.605134308338165
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to no
          type: MSE-val-en-to-no
        metrics:
          - type: negative_mse
            value: -17.149969935417175
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to pl
          type: MSE-val-en-to-pl
        metrics:
          - type: negative_mse
            value: -17.846450209617615
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to pt
          type: MSE-val-en-to-pt
        metrics:
          - type: negative_mse
            value: -17.19353199005127
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ru
          type: MSE-val-en-to-ru
        metrics:
          - type: negative_mse
            value: -18.13419610261917
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to sv
          type: MSE-val-en-to-sv
        metrics:
          - type: negative_mse
            value: -17.13200956583023
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to th
          type: MSE-val-en-to-th
        metrics:
          - type: negative_mse
            value: -26.43084228038788
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to tr
          type: MSE-val-en-to-tr
        metrics:
          - type: negative_mse
            value: -18.183308839797974
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to vi
          type: MSE-val-en-to-vi
        metrics:
          - type: negative_mse
            value: -18.749597668647766
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to zh cn
          type: MSE-val-en-to-zh_cn
        metrics:
          - type: negative_mse
            value: -18.811793625354767
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to zh hk
          type: MSE-val-en-to-zh_hk
        metrics:
          - type: negative_mse
            value: -18.54081153869629
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to zh tw
          type: MSE-val-en-to-zh_tw
        metrics:
          - type: negative_mse
            value: -19.14038509130478
            name: Negative Mse
---

SentenceTransformer based on google-bert/bert-base-multilingual-uncased

This is a sentence-transformers model finetuned from google-bert/bert-base-multilingual-uncased. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

  • Model Type: Sentence Transformer
  • Base model: google-bert/bert-base-multilingual-uncased
  • Maximum Sequence Length: 128 tokens
  • Output Dimensionality: 768 dimensions

Model Sources

  • Documentation: Sentence Transformers Documentation (https://www.sbert.net)
  • Repository: Sentence Transformers on GitHub (https://github.com/UKPLab/sentence-transformers)
  • Hugging Face: Sentence Transformers on Hugging Face (https://huggingface.co/models?library=sentence-transformers)

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
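
The Pooling module above uses mean pooling (pooling_mode_mean_tokens: True): the token embeddings produced by BertModel are averaged, counting only non-padding tokens, to give one 768-dimensional sentence vector. A minimal sketch of that step, with random tensors standing in for real BERT outputs:

import torch

# Mean pooling: average token embeddings over the sequence, masking out padding.
token_embeddings = torch.randn(1, 128, 768)  # [batch, seq_len, hidden] from BertModel
attention_mask = torch.ones(1, 128)          # 1 for real tokens, 0 for padding
mask = attention_mask.unsqueeze(-1)          # [batch, seq_len, 1]
sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
print(sentence_embedding.shape)              # torch.Size([1, 768])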

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers
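
This card was produced with Sentence Transformers 3.3.1 (see Framework Versions below); note that the model.similarity call in the snippet that follows requires Sentence Transformers 3.0 or newer.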

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("luanafelbarros/bert-base-multilingual-uncased-matryoshka-mkqa")
# Run inference
sentences = [
    'who wrote the song i shot the sheriff',
    'i shot the sheriff şarkısını kim besteledi',
    'tr',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# (3, 768)

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# torch.Size([3, 3])
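
The similarity matrix uses the model's configured similarity function, which for Sentence Transformers models defaults to cosine similarity; the active function can be inspected via model.similarity_fn_name.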

Evaluation

Metrics

Knowledge Distillation

  • Datasets: MSE-val-en-to-ar, MSE-val-en-to-da, MSE-val-en-to-de, MSE-val-en-to-en, MSE-val-en-to-es, MSE-val-en-to-fi, MSE-val-en-to-fr, MSE-val-en-to-he, MSE-val-en-to-hu, MSE-val-en-to-it, MSE-val-en-to-ja, MSE-val-en-to-ko, MSE-val-en-to-km, MSE-val-en-to-ms, MSE-val-en-to-nl, MSE-val-en-to-no, MSE-val-en-to-pl, MSE-val-en-to-pt, MSE-val-en-to-ru, MSE-val-en-to-sv, MSE-val-en-to-th, MSE-val-en-to-tr, MSE-val-en-to-vi, MSE-val-en-to-zh_cn, MSE-val-en-to-zh_hk and MSE-val-en-to-zh_tw
  • Evaluated with MSEEvaluator (a minimal usage sketch follows the metric table below)
Metric: negative_mse
  MSE-val-en-to-ar: -20.3772
  MSE-val-en-to-da: -17.1675
  MSE-val-en-to-de: -17.1095
  MSE-val-en-to-en: -15.3337
  MSE-val-en-to-es: -16.8981
  MSE-val-en-to-fi: -18.4286
  MSE-val-en-to-fr: -17.0421
  MSE-val-en-to-he: -19.9421
  MSE-val-en-to-hu: -18.7571
  MSE-val-en-to-it: -17.1871
  MSE-val-en-to-ja: -19.9155
  MSE-val-en-to-ko: -21.3992
  MSE-val-en-to-km: -28.6587
  MSE-val-en-to-ms: -17.2521
  MSE-val-en-to-nl: -16.6051
  MSE-val-en-to-no: -17.15
  MSE-val-en-to-pl: -17.8465
  MSE-val-en-to-pt: -17.1935
  MSE-val-en-to-ru: -18.1342
  MSE-val-en-to-sv: -17.132
  MSE-val-en-to-th: -26.4308
  MSE-val-en-to-tr: -18.1833
  MSE-val-en-to-vi: -18.7496
  MSE-val-en-to-zh_cn: -18.8118
  MSE-val-en-to-zh_hk: -18.5408
  MSE-val-en-to-zh_tw: -19.1404
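
MSEEvaluator encodes the English source sentences with the teacher model and the translated sentences with the student, then reports negative_mse: the mean squared error between the two sets of embeddings, scaled by 100 and negated, so values closer to zero are better. A minimal sketch, assuming a placeholder teacher (the teacher that produced this card's 768-dim targets is not named) and one English/Turkish pair taken from the widget examples:

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import MSEEvaluator

# Placeholder teacher: the actual teacher model behind this card's targets is
# not stated, so the base model stands in here purely for illustration.
teacher = SentenceTransformer("google-bert/bert-base-multilingual-uncased")
student = SentenceTransformer("luanafelbarros/bert-base-multilingual-uncased-matryoshka-mkqa")

# Parallel sentences taken from the widget examples above.
english = ["who wrote the song i shot the sheriff"]
turkish = ["i shot the sheriff şarkısını kim besteledi"]

evaluator = MSEEvaluator(
    source_sentences=english,  # encoded with the teacher
    target_sentences=turkish,  # encoded with the student
    teacher_model=teacher,
    name="MSE-val-en-to-tr",
)
print(evaluator(student))  # e.g. {'MSE-val-en-to-tr_negative_mse': ...}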

Training Details

Training Dataset

Unnamed Dataset

  • Size: 234,000 training samples
  • Columns: english, non-english, target, and label
  • Approximate statistics based on the first 1000 samples:
    english: string, min 10 / mean 11.48 / max 16 tokens
    non-english: string, min 3 / mean 13.27 / max 33 tokens
    target: string, min 3 / mean 3.38 / max 7 tokens
    label: list of 768 elements
  • Samples (english | non-english | target | label):
    who plays hope on days of our lives | من الذي يلعب الأمل في أيام حياتنا | ar | [0.2171212136745453, 0.5138550996780396, 0.5517176389694214, -1.0655105113983154, 1.5853567123413086, ...]
    who plays hope on days of our lives | hvem spiller hope i Horton-sagaen | da | [0.2171212136745453, 0.5138550996780396, 0.5517176389694214, -1.0655105113983154, 1.5853567123413086, ...]
    who plays hope on days of our lives | Wer spielt die Hope in Zeit der Sehnsucht? | de | [0.2171212136745453, 0.5138550996780396, 0.5517176389694214, -1.0655105113983154, 1.5853567123413086, ...]
  • Loss: MSELoss
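
MSELoss here regresses the student's embeddings onto the precomputed teacher embeddings stored in the label column. A minimal sketch of the objective with stand-in tensors, assuming (as the identical labels across the three samples above suggest) that label holds the teacher's embedding of the English sentence and the student is pulled toward it for both languages:

import torch
import torch.nn.functional as F

# Stand-ins for real embeddings: "teacher_label" plays the role of the 768-dim
# "label" column; the student's outputs for the English and non-English text
# should both move toward it.
teacher_label = torch.randn(768)
student_english = torch.randn(768, requires_grad=True)      # student embedding of the English sentence
student_non_english = torch.randn(768, requires_grad=True)  # student embedding of the translation

loss = F.mse_loss(student_english, teacher_label) + F.mse_loss(student_non_english, teacher_label)
loss.backward()  # during training, gradients flow back into the student encoder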

Evaluation Dataset

Unnamed Dataset

  • Size: 13,000 evaluation samples
  • Columns: english, non-english, target, and label
  • Approximate statistics based on the first 1000 samples:
    english: string, min 10 / mean 11.53 / max 14 tokens
    non-english: string, min 3 / mean 13.37 / max 50 tokens
    target: string, min 3 / mean 3.38 / max 7 tokens
    label: list of 768 elements
  • Samples (english | non-english | target | label):
    who played prudence on nanny and the professor | من لعب دور "prudence" فى "nanny and the professor" | ar | [-0.2837616801261902, -0.4943353235721588, 0.020107418298721313, 0.7796109318733215, -0.47365888953208923, ...]
    who played prudence on nanny and the professor | hvem spiller prudence på nanny and the professor | da | [-0.2837616801261902, -0.4943353235721588, 0.020107418298721313, 0.7796109318733215, -0.47365888953208923, ...]
    who played prudence on nanny and the professor | Wer spielte Prudence in Nanny and the Professor | de | [-0.2837616801261902, -0.4943353235721588, 0.020107418298721313, 0.7796109318733215, -0.47365888953208923, ...]
  • Loss: MSELoss

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 64
  • per_device_eval_batch_size: 64
  • learning_rate: 1e-05
  • num_train_epochs: 1
  • warmup_ratio: 0.1
  • fp16: True
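
These non-default values map directly onto SentenceTransformerTrainingArguments. A minimal training sketch under the same settings; the one-row dataset and output_dir are placeholders (the card's actual datasets are unnamed), and fp16=True requires a GPU:

from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MSELoss

model = SentenceTransformer("google-bert/bert-base-multilingual-uncased")
loss = MSELoss(model)

# Placeholder one-row dataset mirroring the text/label columns described above;
# the real train/eval splits have 234,000 and 13,000 rows.
train_dataset = Dataset.from_dict({
    "english": ["who plays hope on days of our lives"],
    "non-english": ["hvem spiller hope i Horton-sagaen"],
    "label": [[0.0] * 768],  # stand-in for a real 768-dim teacher embedding
})

args = SentenceTransformerTrainingArguments(
    output_dir="output",  # hypothetical path
    eval_strategy="steps",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=1e-5,
    num_train_epochs=1,
    warmup_ratio=0.1,
    fp16=True,  # mixed precision; requires a CUDA device
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,  # placeholder; substitute the real eval split
    loss=loss,
)
trainer.train()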

All Hyperparameters

Click to expand
  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: True
  • per_device_train_batch_size: 64
  • per_device_eval_batch_size: 64
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • torch_empty_cache_steps: None
  • learning_rate: 1e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 1
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.1
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • restore_callback_states_from_checkpoint: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: False
  • fp16: True
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: False
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • include_for_metrics: []
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_eval_metrics: False
  • eval_on_start: False
  • use_liger_kernel: False
  • eval_use_gather_object: False
  • average_tokens_across_devices: False
  • prompts: None
  • batch_sampler: batch_sampler
  • multi_dataset_batch_sampler: proportional

Training Logs

Epoch Step Training Loss Validation Loss MSE-val-en-to-ar_negative_mse MSE-val-en-to-da_negative_mse MSE-val-en-to-de_negative_mse MSE-val-en-to-en_negative_mse MSE-val-en-to-es_negative_mse MSE-val-en-to-fi_negative_mse MSE-val-en-to-fr_negative_mse MSE-val-en-to-he_negative_mse MSE-val-en-to-hu_negative_mse MSE-val-en-to-it_negative_mse MSE-val-en-to-ja_negative_mse MSE-val-en-to-ko_negative_mse MSE-val-en-to-km_negative_mse MSE-val-en-to-ms_negative_mse MSE-val-en-to-nl_negative_mse MSE-val-en-to-no_negative_mse MSE-val-en-to-pl_negative_mse MSE-val-en-to-pt_negative_mse MSE-val-en-to-ru_negative_mse MSE-val-en-to-sv_negative_mse MSE-val-en-to-th_negative_mse MSE-val-en-to-tr_negative_mse MSE-val-en-to-vi_negative_mse MSE-val-en-to-zh_cn_negative_mse MSE-val-en-to-zh_hk_negative_mse MSE-val-en-to-zh_tw_negative_mse
0.1367 500 0.3588 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.2734 1000 0.3078 0.2868 -27.3597 -26.5326 -26.5313 -26.0601 -26.4280 -26.8319 -26.4885 -27.1627 -26.9695 -26.5628 -27.2583 -27.7239 -31.2177 -26.6501 -26.4197 -26.4809 -26.6655 -26.4345 -26.6570 -26.5526 -30.4823 -26.9554 -27.1040 -27.0230 -26.9012 -27.0515
0.4102 1500 0.2846 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.5469 2000 0.2707 0.2617 -24.6096 -22.8821 -22.8752 -21.8660 -22.7026 -23.6128 -22.7468 -24.2281 -23.6469 -22.9147 -24.3616 -25.2999 -30.4061 -23.0865 -22.5916 -22.8392 -23.1451 -22.7741 -23.2652 -22.9440 -29.2747 -23.5285 -23.8786 -23.6384 -23.5170 -23.8081
0.6836 2500 0.2613 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.8203 3000 0.2542 0.2491 -23.2261 -21.0314 -20.9970 -19.7599 -20.8388 -21.9791 -20.8374 -22.8299 -22.0605 -21.0367 -22.9281 -24.1290 -29.9238 -21.2195 -20.6506 -20.9939 -21.4204 -20.9651 -21.5594 -21.0815 -28.3947 -21.8046 -22.2153 -21.9866 -21.8474 -22.1930
0.9571 3500 0.248 - - - - - - - - - - - - - - - - - - - - - - - - - - -
1.0938 4000 0.2438 0.2420 -22.4435 -19.9880 -19.9588 -18.5856 -19.7880 -20.9892 -19.8194 -21.9951 -21.1703 -19.9940 -22.1052 -23.3569 -29.5927 -20.1685 -19.5862 -19.9676 -20.4346 -19.9623 -20.6201 -20.0273 -27.9725 -20.8061 -21.2406 -21.0913 -20.9345 -21.3353
1.2305 4500 0.2401 - - - - - - - - - - - - - - - - - - - - - - - - - - -
1.3672 5000 0.2371 0.2373 -21.9444 -19.3005 -19.2441 -17.7989 -19.0868 -20.3950 -19.1305 -21.5127 -20.6068 -19.3250 -21.5673 -22.8791 -29.3793 -19.4702 -18.8669 -19.2886 -19.8258 -19.3057 -20.0101 -19.3345 -27.5779 -20.1899 -20.6284 -20.5167 -20.3229 -20.7721
1.5040 5500 0.2349 - - - - - - - - - - - - - - - - - - - - - - - - - - -
1.6407 6000 0.2336 0.2346 -21.6615 -18.9016 -18.8657 -17.3452 -18.6869 -20.0105 -18.7528 -21.1990 -20.2645 -18.9266 -21.2386 -22.6295 -29.2204 -19.0695 -18.4641 -18.9026 -19.4506 -18.9074 -19.6659 -18.9515 -27.3466 -19.8162 -20.2736 -20.1841 -19.9848 -20.4531
1.7774 6500 0.2319 - - - - - - - - - - - - - - - - - - - - - - - - - - -
1.9141 7000 0.2309 0.2332 -21.5220 -18.7091 -18.6632 -17.1205 -18.4809 -19.8342 -18.5557 -21.0604 -20.0990 -18.7323 -21.0808 -22.4971 -29.1680 -18.8630 -18.2583 -18.6989 -19.2859 -18.7163 -19.4929 -18.7442 -27.2443 -19.6327 -20.1037 -20.0234 -19.8106 -20.3017
0.1367 500 0.2302 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.2734 1000 0.2261 0.2290 -21.1100 -18.0936 -18.0277 -16.4059 -17.8516 -19.2687 -17.9684 -20.6744 -19.5689 -18.1063 -20.6725 -22.0790 -28.9503 -18.2049 -17.5842 -18.0814 -18.7115 -18.1111 -18.9581 -18.1032 -26.8510 -19.0325 -19.5538 -19.6006 -19.3362 -19.8807
0.4102 1500 0.222 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.5469 2000 0.2188 0.2246 -20.5835 -17.4530 -17.3853 -15.6663 -17.1929 -18.6930 -17.3208 -20.1688 -19.0165 -17.4784 -20.1460 -21.6056 -28.7345 -17.5632 -16.9100 -17.4263 -18.0993 -17.4835 -18.3902 -17.4462 -26.5854 -18.4647 -19.0091 -19.0492 -18.7904 -19.3776
0.6836 2500 0.2166 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.8203 3000 0.2148 0.2226 -20.3772 -17.1675 -17.1095 -15.3337 -16.8981 -18.4286 -17.0421 -19.9421 -18.7571 -17.1871 -19.9155 -21.3992 -28.6587 -17.2521 -16.6051 -17.1500 -17.8465 -17.1935 -18.1342 -17.1320 -26.4308 -18.1833 -18.7496 -18.8118 -18.5408 -19.1404
0.9571 3500 0.2133 - - - - - - - - - - - - - - - - - - - - - - - - - - -

Framework Versions

  • Python: 3.10.12
  • Sentence Transformers: 3.3.1
  • Transformers: 4.46.3
  • PyTorch: 2.5.1+cu121
  • Accelerate: 1.1.1
  • Datasets: 3.1.0
  • Tokenizers: 0.20.3

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MSELoss

@inproceedings{reimers-2020-multilingual-sentence-bert,
    title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2020",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/2004.09813",
}