bwang0911 committed on
Commit
d6dfd43
·
verified ·
1 Parent(s): 4b7a270

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +27 -5
README.md CHANGED
@@ -115,8 +115,8 @@ def mean_pooling(model_output, attention_mask):
115
  return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
116
 
117
  sentences = [
118
- 'Save model to a pickle located at `path` with Python please',
119
- 'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
120
  ]
121
 
122
  tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code')
@@ -144,12 +144,12 @@ cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
144
  model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
145
  embeddings = model.encode(
146
  [
147
- 'Save model to a pickle located at `path` with Python please',
148
- 'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
149
  ]
150
  )
151
  print(cos_sim(embeddings[0], embeddings[1]))
152
- >>> 0.7230249
153
  ```
154
 
155
  If you only want to handle shorter sequences, such as 2k, pass the `max_length` parameter to the `encode` function:
@@ -161,6 +161,28 @@ embeddings = model.encode(
161
  )
162
  ```
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  ## Plans
165
 
166
  1. Bilingual embedding models supporting more European & Asian languages, including Spanish, French, Italian and Japanese.
 
115
  return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
116
 
117
  sentences = [
118
+ 'How do I access the index while iterating over a sequence with a for loop?',
119
+ '# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n print(idx, x)',
120
  ]
121
 
122
  tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code')
 
144
  model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
145
  embeddings = model.encode(
146
  [
147
+ 'How do I access the index while iterating over a sequence with a for loop?',
148
+ '# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n print(idx, x)',
149
  ]
150
  )
151
  print(cos_sim(embeddings[0], embeddings[1]))
152
+ >>> tensor([[0.7282]])
153
  ```
154
 
155
  If you only want to handle shorter sequences, such as 2k, pass the `max_length` parameter to the `encode` function:
 
161
  )
162
  ```
163
 
164
+ As of its latest release (v2.3.0), sentence-transformers also supports Jina embeddings (please make sure that you are logged in to Hugging Face as well):
165
+
166
+ ```python
167
+ !pip install -U sentence-transformers
168
+ from sentence_transformers import SentenceTransformer
169
+ from sentence_transformers.util import cos_sim
170
+
171
+ model = SentenceTransformer(
172
+ "jinaai/jina-embeddings-v2-base-code",
173
+ trust_remote_code=True
174
+ )
175
+
176
+ # control your input sequence length up to 8192
177
+ model.max_seq_length = 1024
178
+
179
+ embeddings = model.encode([
180
+ 'How do I access the index while iterating over a sequence with a for loop?',
181
+ '# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n print(idx, x)',
182
+ ])
183
+ print(cos_sim(embeddings[0], embeddings[1]))
184
+ ```
185
+
186
  ## Plans
187
 
188
  1. Bilingual embedding models supporting more European & Asian languages, including Spanish, French, Italian and Japanese.