---
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - generated_from_trainer
  - dataset_size:234000
  - loss:MSELoss
base_model: google-bert/bert-base-multilingual-uncased
widget:
  - source_sentence: who sings in spite of ourselves with john prine
    sentences:
      - es
      - når ble michael jordan draftet til nba
      - quien canta en spite of ourselves con john prine
  - source_sentence: who wrote when you look me in the eyes
    sentences:
      - متى بدأت الفتاة الكشفية في بيع ملفات تعريف الارتباط
      - A écrit when you look me in the eyes
      - fr
  - source_sentence: when was fathers day made a national holiday
    sentences:
      - wann wurde der Vatertag zum nationalen Feiertag
      - de
      - ' អ្នកណាច្រៀង i want to sing you a love song'
  - source_sentence: what is the density of the continental crust
    sentences:
      - cuál es la densidad de la corteza continental
      - wie zingt i want to sing you a love song
      - es
  - source_sentence: who wrote the song i shot the sheriff
    sentences:
      - Quel est l'âge légal pour consommer du vin au Canada?
      - i shot the sheriff şarkısını kim besteledi
      - tr
pipeline_tag: sentence-similarity
library_name: sentence-transformers
metrics:
  - negative_mse
model-index:
  - name: SentenceTransformer based on google-bert/bert-base-multilingual-uncased
    results:
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ar
          type: MSE-val-en-to-ar
        metrics:
          - type: negative_mse
            value: -20.37721574306488
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to da
          type: MSE-val-en-to-da
        metrics:
          - type: negative_mse
            value: -17.167489230632782
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to de
          type: MSE-val-en-to-de
        metrics:
          - type: negative_mse
            value: -17.10948944091797
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to en
          type: MSE-val-en-to-en
        metrics:
          - type: negative_mse
            value: -15.333698689937592
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to es
          type: MSE-val-en-to-es
        metrics:
          - type: negative_mse
            value: -16.898061335086823
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to fi
          type: MSE-val-en-to-fi
        metrics:
          - type: negative_mse
            value: -18.428558111190796
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to fr
          type: MSE-val-en-to-fr
        metrics:
          - type: negative_mse
            value: -17.04207956790924
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to he
          type: MSE-val-en-to-he
        metrics:
          - type: negative_mse
            value: -19.942057132720947
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to hu
          type: MSE-val-en-to-hu
        metrics:
          - type: negative_mse
            value: -18.757066130638123
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to it
          type: MSE-val-en-to-it
        metrics:
          - type: negative_mse
            value: -17.18708872795105
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ja
          type: MSE-val-en-to-ja
        metrics:
          - type: negative_mse
            value: -19.915536046028137
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ko
          type: MSE-val-en-to-ko
        metrics:
          - type: negative_mse
            value: -21.39919400215149
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to km
          type: MSE-val-en-to-km
        metrics:
          - type: negative_mse
            value: -28.658682107925415
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ms
          type: MSE-val-en-to-ms
        metrics:
          - type: negative_mse
            value: -17.25209951400757
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to nl
          type: MSE-val-en-to-nl
        metrics:
          - type: negative_mse
            value: -16.605134308338165
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to no
          type: MSE-val-en-to-no
        metrics:
          - type: negative_mse
            value: -17.149969935417175
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to pl
          type: MSE-val-en-to-pl
        metrics:
          - type: negative_mse
            value: -17.846450209617615
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to pt
          type: MSE-val-en-to-pt
        metrics:
          - type: negative_mse
            value: -17.19353199005127
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ru
          type: MSE-val-en-to-ru
        metrics:
          - type: negative_mse
            value: -18.13419610261917
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to sv
          type: MSE-val-en-to-sv
        metrics:
          - type: negative_mse
            value: -17.13200956583023
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to th
          type: MSE-val-en-to-th
        metrics:
          - type: negative_mse
            value: -26.43084228038788
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to tr
          type: MSE-val-en-to-tr
        metrics:
          - type: negative_mse
            value: -18.183308839797974
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to vi
          type: MSE-val-en-to-vi
        metrics:
          - type: negative_mse
            value: -18.749597668647766
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to zh cn
          type: MSE-val-en-to-zh_cn
        metrics:
          - type: negative_mse
            value: -18.811793625354767
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to zh hk
          type: MSE-val-en-to-zh_hk
        metrics:
          - type: negative_mse
            value: -18.54081153869629
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to zh tw
          type: MSE-val-en-to-zh_tw
        metrics:
          - type: negative_mse
            value: -19.14038509130478
            name: Negative Mse
---

SentenceTransformer based on google-bert/bert-base-multilingual-uncased

This is a sentence-transformers model finetuned from google-bert/bert-base-multilingual-uncased. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

  • Model Type: Sentence Transformer
  • Base model: google-bert/bert-base-multilingual-uncased
  • Maximum Sequence Length: 128 tokens
  • Output Dimensionality: 768 dimensions

Model Sources

  • Documentation: Sentence Transformers Documentation (https://www.sbert.net)
  • Repository: Sentence Transformers on GitHub (https://github.com/UKPLab/sentence-transformers)
  • Hugging Face: Sentence Transformers on Hugging Face (https://huggingface.co/models?library=sentence-transformers)

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
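
The Pooling module above uses mean pooling (pooling_mode_mean_tokens: True): the token embeddings produced by BertModel are averaged, counting only non-padding tokens, to give one 768-dimensional sentence vector. A minimal sketch of that step, with random tensors standing in for real BERT outputs:

import torch

# Mean pooling: average token embeddings over the sequence, masking out padding.
token_embeddings = torch.randn(1, 128, 768)  # [batch, seq_len, hidden] from BertModel
attention_mask = torch.ones(1, 128)          # 1 for real tokens, 0 for padding
mask = attention_mask.unsqueeze(-1)          # [batch, seq_len, 1]
sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
print(sentence_embedding.shape)              # torch.Size([1, 768])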

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers
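
This card was produced with Sentence Transformers 3.3.1 (see Framework Versions below); note that the model.similarity call in the snippet that follows requires Sentence Transformers 3.0 or newer.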

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("luanafelbarros/bert-base-multilingual-uncased-matryoshka-mkqa")
# Run inference
sentences = [
    'who wrote the song i shot the sheriff',
    'i shot the sheriff şarkısını kim besteledi',
    'tr',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# (3, 768)

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# torch.Size([3, 3])
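
The similarity matrix uses the model's configured similarity function, which for Sentence Transformers models defaults to cosine similarity; the active function can be inspected via model.similarity_fn_name.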

Evaluation

Metrics

Knowledge Distillation

  • Datasets: MSE-val-en-to-ar, MSE-val-en-to-da, MSE-val-en-to-de, MSE-val-en-to-en, MSE-val-en-to-es, MSE-val-en-to-fi, MSE-val-en-to-fr, MSE-val-en-to-he, MSE-val-en-to-hu, MSE-val-en-to-it, MSE-val-en-to-ja, MSE-val-en-to-ko, MSE-val-en-to-km, MSE-val-en-to-ms, MSE-val-en-to-nl, MSE-val-en-to-no, MSE-val-en-to-pl, MSE-val-en-to-pt, MSE-val-en-to-ru, MSE-val-en-to-sv, MSE-val-en-to-th, MSE-val-en-to-tr, MSE-val-en-to-vi, MSE-val-en-to-zh_cn, MSE-val-en-to-zh_hk and MSE-val-en-to-zh_tw
  • Evaluated with MSEEvaluator (a minimal usage sketch follows the metric table below)
Metric: negative_mse
  MSE-val-en-to-ar: -20.3772
  MSE-val-en-to-da: -17.1675
  MSE-val-en-to-de: -17.1095
  MSE-val-en-to-en: -15.3337
  MSE-val-en-to-es: -16.8981
  MSE-val-en-to-fi: -18.4286
  MSE-val-en-to-fr: -17.0421
  MSE-val-en-to-he: -19.9421
  MSE-val-en-to-hu: -18.7571
  MSE-val-en-to-it: -17.1871
  MSE-val-en-to-ja: -19.9155
  MSE-val-en-to-ko: -21.3992
  MSE-val-en-to-km: -28.6587
  MSE-val-en-to-ms: -17.2521
  MSE-val-en-to-nl: -16.6051
  MSE-val-en-to-no: -17.15
  MSE-val-en-to-pl: -17.8465
  MSE-val-en-to-pt: -17.1935
  MSE-val-en-to-ru: -18.1342
  MSE-val-en-to-sv: -17.132
  MSE-val-en-to-th: -26.4308
  MSE-val-en-to-tr: -18.1833
  MSE-val-en-to-vi: -18.7496
  MSE-val-en-to-zh_cn: -18.8118
  MSE-val-en-to-zh_hk: -18.5408
  MSE-val-en-to-zh_tw: -19.1404
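
MSEEvaluator encodes the English source sentences with the teacher model and the translated sentences with the student, then reports negative_mse: the mean squared error between the two sets of embeddings, scaled by 100 and negated, so values closer to zero are better. A minimal sketch, assuming a placeholder teacher (the teacher that produced this card's 768-dim targets is not named) and one English/Turkish pair taken from the widget examples:

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import MSEEvaluator

# Placeholder teacher: the actual teacher model behind this card's targets is
# not stated, so the base model stands in here purely for illustration.
teacher = SentenceTransformer("google-bert/bert-base-multilingual-uncased")
student = SentenceTransformer("luanafelbarros/bert-base-multilingual-uncased-matryoshka-mkqa")

# Parallel sentences taken from the widget examples above.
english = ["who wrote the song i shot the sheriff"]
turkish = ["i shot the sheriff şarkısını kim besteledi"]

evaluator = MSEEvaluator(
    source_sentences=english,  # encoded with the teacher
    target_sentences=turkish,  # encoded with the student
    teacher_model=teacher,
    name="MSE-val-en-to-tr",
)
print(evaluator(student))  # e.g. {'MSE-val-en-to-tr_negative_mse': ...}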

Training Details

Training Dataset

Unnamed Dataset

  • Size: 234,000 training samples
  • Columns: english, non-english, target, and label
  • Approximate statistics based on the first 1000 samples:
    english: string, min 10 / mean 11.48 / max 16 tokens
    non-english: string, min 3 / mean 13.27 / max 33 tokens
    target: string, min 3 / mean 3.38 / max 7 tokens
    label: list of 768 elements
  • Samples (english | non-english | target | label):
    who plays hope on days of our lives | من الذي يلعب الأمل في أيام حياتنا | ar | [0.2171212136745453, 0.5138550996780396, 0.5517176389694214, -1.0655105113983154, 1.5853567123413086, ...]
    who plays hope on days of our lives | hvem spiller hope i Horton-sagaen | da | [0.2171212136745453, 0.5138550996780396, 0.5517176389694214, -1.0655105113983154, 1.5853567123413086, ...]
    who plays hope on days of our lives | Wer spielt die Hope in Zeit der Sehnsucht? | de | [0.2171212136745453, 0.5138550996780396, 0.5517176389694214, -1.0655105113983154, 1.5853567123413086, ...]
  • Loss: MSELoss
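
MSELoss here regresses the student's embeddings onto the precomputed teacher embeddings stored in the label column. A minimal sketch of the objective with stand-in tensors, assuming (as the identical labels across the three samples above suggest) that label holds the teacher's embedding of the English sentence and the student is pulled toward it for both languages:

import torch
import torch.nn.functional as F

# Stand-ins for real embeddings: "teacher_label" plays the role of the 768-dim
# "label" column; the student's outputs for the English and non-English text
# should both move toward it.
teacher_label = torch.randn(768)
student_english = torch.randn(768, requires_grad=True)      # student embedding of the English sentence
student_non_english = torch.randn(768, requires_grad=True)  # student embedding of the translation

loss = F.mse_loss(student_english, teacher_label) + F.mse_loss(student_non_english, teacher_label)
loss.backward()  # during training, gradients flow back into the student encoder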

Evaluation Dataset

Unnamed Dataset

  • Size: 13,000 evaluation samples
  • Columns: english, non-english, target, and label
  • Approximate statistics based on the first 1000 samples:
    english: string, min 10 / mean 11.53 / max 14 tokens
    non-english: string, min 3 / mean 13.37 / max 50 tokens
    target: string, min 3 / mean 3.38 / max 7 tokens
    label: list of 768 elements
  • Samples (english | non-english | target | label):
    who played prudence on nanny and the professor | من لعب دور "prudence" فى "nanny and the professor" | ar | [-0.2837616801261902, -0.4943353235721588, 0.020107418298721313, 0.7796109318733215, -0.47365888953208923, ...]
    who played prudence on nanny and the professor | hvem spiller prudence på nanny and the professor | da | [-0.2837616801261902, -0.4943353235721588, 0.020107418298721313, 0.7796109318733215, -0.47365888953208923, ...]
    who played prudence on nanny and the professor | Wer spielte Prudence in Nanny and the Professor | de | [-0.2837616801261902, -0.4943353235721588, 0.020107418298721313, 0.7796109318733215, -0.47365888953208923, ...]
  • Loss: MSELoss

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 64
  • per_device_eval_batch_size: 64
  • learning_rate: 1e-05
  • num_train_epochs: 1
  • warmup_ratio: 0.1
  • fp16: True
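
These non-default values map directly onto SentenceTransformerTrainingArguments. A minimal training sketch under the same settings; the one-row dataset and output_dir are placeholders (the card's actual datasets are unnamed), and fp16=True requires a GPU:

from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MSELoss

model = SentenceTransformer("google-bert/bert-base-multilingual-uncased")
loss = MSELoss(model)

# Placeholder one-row dataset mirroring the text/label columns described above;
# the real train/eval splits have 234,000 and 13,000 rows.
train_dataset = Dataset.from_dict({
    "english": ["who plays hope on days of our lives"],
    "non-english": ["hvem spiller hope i Horton-sagaen"],
    "label": [[0.0] * 768],  # stand-in for a real 768-dim teacher embedding
})

args = SentenceTransformerTrainingArguments(
    output_dir="output",  # hypothetical path
    eval_strategy="steps",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=1e-5,
    num_train_epochs=1,
    warmup_ratio=0.1,
    fp16=True,  # mixed precision; requires a CUDA device
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,  # placeholder; substitute the real eval split
    loss=loss,
)
trainer.train()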

All Hyperparameters

Click to expand
  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: True
  • per_device_train_batch_size: 64
  • per_device_eval_batch_size: 64
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • torch_empty_cache_steps: None
  • learning_rate: 1e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 1
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.1
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • restore_callback_states_from_checkpoint: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: False
  • fp16: True
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: False
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • include_for_metrics: []
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_eval_metrics: False
  • eval_on_start: False
  • use_liger_kernel: False
  • eval_use_gather_object: False
  • average_tokens_across_devices: False
  • prompts: None
  • batch_sampler: batch_sampler
  • multi_dataset_batch_sampler: proportional

Training Logs

Epoch Step Training Loss Validation Loss MSE-val-en-to-ar_negative_mse MSE-val-en-to-da_negative_mse MSE-val-en-to-de_negative_mse MSE-val-en-to-en_negative_mse MSE-val-en-to-es_negative_mse MSE-val-en-to-fi_negative_mse MSE-val-en-to-fr_negative_mse MSE-val-en-to-he_negative_mse MSE-val-en-to-hu_negative_mse MSE-val-en-to-it_negative_mse MSE-val-en-to-ja_negative_mse MSE-val-en-to-ko_negative_mse MSE-val-en-to-km_negative_mse MSE-val-en-to-ms_negative_mse MSE-val-en-to-nl_negative_mse MSE-val-en-to-no_negative_mse MSE-val-en-to-pl_negative_mse MSE-val-en-to-pt_negative_mse MSE-val-en-to-ru_negative_mse MSE-val-en-to-sv_negative_mse MSE-val-en-to-th_negative_mse MSE-val-en-to-tr_negative_mse MSE-val-en-to-vi_negative_mse MSE-val-en-to-zh_cn_negative_mse MSE-val-en-to-zh_hk_negative_mse MSE-val-en-to-zh_tw_negative_mse
0.1367 500 0.3588 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.2734 1000 0.3078 0.2868 -27.3597 -26.5326 -26.5313 -26.0601 -26.4280 -26.8319 -26.4885 -27.1627 -26.9695 -26.5628 -27.2583 -27.7239 -31.2177 -26.6501 -26.4197 -26.4809 -26.6655 -26.4345 -26.6570 -26.5526 -30.4823 -26.9554 -27.1040 -27.0230 -26.9012 -27.0515
0.4102 1500 0.2846 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.5469 2000 0.2707 0.2617 -24.6096 -22.8821 -22.8752 -21.8660 -22.7026 -23.6128 -22.7468 -24.2281 -23.6469 -22.9147 -24.3616 -25.2999 -30.4061 -23.0865 -22.5916 -22.8392 -23.1451 -22.7741 -23.2652 -22.9440 -29.2747 -23.5285 -23.8786 -23.6384 -23.5170 -23.8081
0.6836 2500 0.2613 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.8203 3000 0.2542 0.2491 -23.2261 -21.0314 -20.9970 -19.7599 -20.8388 -21.9791 -20.8374 -22.8299 -22.0605 -21.0367 -22.9281 -24.1290 -29.9238 -21.2195 -20.6506 -20.9939 -21.4204 -20.9651 -21.5594 -21.0815 -28.3947 -21.8046 -22.2153 -21.9866 -21.8474 -22.1930
0.9571 3500 0.248 - - - - - - - - - - - - - - - - - - - - - - - - - - -
1.0938 4000 0.2438 0.2420 -22.4435 -19.9880 -19.9588 -18.5856 -19.7880 -20.9892 -19.8194 -21.9951 -21.1703 -19.9940 -22.1052 -23.3569 -29.5927 -20.1685 -19.5862 -19.9676 -20.4346 -19.9623 -20.6201 -20.0273 -27.9725 -20.8061 -21.2406 -21.0913 -20.9345 -21.3353
1.2305 4500 0.2401 - - - - - - - - - - - - - - - - - - - - - - - - - - -
1.3672 5000 0.2371 0.2373 -21.9444 -19.3005 -19.2441 -17.7989 -19.0868 -20.3950 -19.1305 -21.5127 -20.6068 -19.3250 -21.5673 -22.8791 -29.3793 -19.4702 -18.8669 -19.2886 -19.8258 -19.3057 -20.0101 -19.3345 -27.5779 -20.1899 -20.6284 -20.5167 -20.3229 -20.7721
1.5040 5500 0.2349 - - - - - - - - - - - - - - - - - - - - - - - - - - -
1.6407 6000 0.2336 0.2346 -21.6615 -18.9016 -18.8657 -17.3452 -18.6869 -20.0105 -18.7528 -21.1990 -20.2645 -18.9266 -21.2386 -22.6295 -29.2204 -19.0695 -18.4641 -18.9026 -19.4506 -18.9074 -19.6659 -18.9515 -27.3466 -19.8162 -20.2736 -20.1841 -19.9848 -20.4531
1.7774 6500 0.2319 - - - - - - - - - - - - - - - - - - - - - - - - - - -
1.9141 7000 0.2309 0.2332 -21.5220 -18.7091 -18.6632 -17.1205 -18.4809 -19.8342 -18.5557 -21.0604 -20.0990 -18.7323 -21.0808 -22.4971 -29.1680 -18.8630 -18.2583 -18.6989 -19.2859 -18.7163 -19.4929 -18.7442 -27.2443 -19.6327 -20.1037 -20.0234 -19.8106 -20.3017
0.1367 500 0.2302 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.2734 1000 0.2261 0.2290 -21.1100 -18.0936 -18.0277 -16.4059 -17.8516 -19.2687 -17.9684 -20.6744 -19.5689 -18.1063 -20.6725 -22.0790 -28.9503 -18.2049 -17.5842 -18.0814 -18.7115 -18.1111 -18.9581 -18.1032 -26.8510 -19.0325 -19.5538 -19.6006 -19.3362 -19.8807
0.4102 1500 0.222 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.5469 2000 0.2188 0.2246 -20.5835 -17.4530 -17.3853 -15.6663 -17.1929 -18.6930 -17.3208 -20.1688 -19.0165 -17.4784 -20.1460 -21.6056 -28.7345 -17.5632 -16.9100 -17.4263 -18.0993 -17.4835 -18.3902 -17.4462 -26.5854 -18.4647 -19.0091 -19.0492 -18.7904 -19.3776
0.6836 2500 0.2166 - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.8203 3000 0.2148 0.2226 -20.3772 -17.1675 -17.1095 -15.3337 -16.8981 -18.4286 -17.0421 -19.9421 -18.7571 -17.1871 -19.9155 -21.3992 -28.6587 -17.2521 -16.6051 -17.1500 -17.8465 -17.1935 -18.1342 -17.1320 -26.4308 -18.1833 -18.7496 -18.8118 -18.5408 -19.1404
0.9571 3500 0.2133 - - - - - - - - - - - - - - - - - - - - - - - - - - -

Framework Versions

  • Python: 3.10.12
  • Sentence Transformers: 3.3.1
  • Transformers: 4.46.3
  • PyTorch: 2.5.1+cu121
  • Accelerate: 1.1.1
  • Datasets: 3.1.0
  • Tokenizers: 0.20.3

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MSELoss

@inproceedings{reimers-2020-multilingual-sentence-bert,
    title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2020",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/2004.09813",
}