orionweller committed on
Commit
9e142e5
·
verified ·
1 Parent(s): 27f3f00

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +15 -8
README.md CHANGED
@@ -21,6 +21,13 @@ tags:
21
 
22
  Below is an example to compute the similarity score of a query-document pair
23
  ```python
 
 
 
 
 
 
 
24
  model_name = "jhu-clsp/FollowIR-7B"
25
  model = AutoModelForCausalLM.from_pretrained(
26
  model_name
@@ -32,8 +39,6 @@ tokenizer.pad_token = tokenizer.eos_token
32
  tokenizer.padding_side = "left"
33
  token_false_id = tokenizer.get_vocab()["false"]
34
  token_true_id = tokenizer.get_vocab()["true"]
35
- max_length = min(2048, tokenizer.model_max_length)
36
-
37
  template = """<s> [INST] You are an expert Google searcher, whose job is to determine if the following document is relevant to the query (true/false). Answer using only one word, one of those two choices.
38
 
39
  Query: {query}
@@ -41,32 +46,34 @@ Document: {text}
41
  Relevant (only output one word, either "true" or "false"): [/INST] """
42
 
43
 
 
 
 
 
 
44
  prompts = [
45
- template.format(query=query, text=text) for (query, text) in zip([query] * 2, passages)
46
  ]
47
  tokens = tokenizer(
48
  prompts,
49
  padding=True,
50
  truncation=True,
51
  return_tensors="pt",
52
- max_length=max_length,
53
  pad_to_multiple_of=None,
54
  )
55
 
56
- if "token_type_ids" in tokens:
57
- del tokens["token_type_ids"]
58
-
59
  # move to cuda if desired
60
  for key in tokens:
61
  tokens[key] = tokens[key].cuda()
62
 
63
- # calculate the scores
64
  batch_scores = model(**tokens).logits[:, -1, :]
65
  true_vector = batch_scores[:, token_true_id]
66
  false_vector = batch_scores[:, token_false_id]
67
  batch_scores = torch.stack([false_vector, true_vector], dim=1)
68
  batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
69
  scores = batch_scores[:, 1].exp().tolist()
 
70
  ```
71
 
72
  # Citation
 
21
 
22
  Below is an example to compute the similarity score of a query-document pair
23
  ```python
24
+ from transformers import (
25
+ AutoTokenizer,
26
+ AutoModelForCausalLM,
27
+ )
28
+ import torch
29
+
30
+ # model loading and setup
31
  model_name = "jhu-clsp/FollowIR-7B"
32
  model = AutoModelForCausalLM.from_pretrained(
33
  model_name
 
39
  tokenizer.padding_side = "left"
40
  token_false_id = tokenizer.get_vocab()["false"]
41
  token_true_id = tokenizer.get_vocab()["true"]
 
 
42
  template = """<s> [INST] You are an expert Google searcher, whose job is to determine if the following document is relevant to the query (true/false). Answer using only one word, one of those two choices.
43
 
44
  Query: {query}
 
46
  Relevant (only output one word, either "true" or "false"): [/INST] """
47
 
48
 
49
+ ## Let's define some example queries with instructions in the query and the passage
50
+ query1 = "What movies were written by James Cameron? A relevant document would describe a movie that was written by James Cameron only and not with anyone else"
51
+ query2 = "What movies were directed by James Cameron? A relevant document would describe any movie that was directed by James Cameron"
52
+ passages = ["Avatar: The Way of Water is a 2022 American epic science fiction film co-produced and directed by James Cameron, who co-wrote the screenplay with Rick Jaffa and Amanda Silver from a story the trio wrote with Josh Friedman and Shane Salerno. Distributed by 20th Century Studios, it is the sequel to Avatar (2009) and the second installment in the Avatar film series."] * 2
53
+
54
  prompts = [
55
+ template.format(query=query, text=text) for (query, text) in zip([query1, query2], passages)
56
  ]
57
  tokens = tokenizer(
58
  prompts,
59
  padding=True,
60
  truncation=True,
61
  return_tensors="pt",
 
62
  pad_to_multiple_of=None,
63
  )
64
 
 
 
 
65
  # move to cuda if desired
66
  for key in tokens:
67
  tokens[key] = tokens[key].cuda()
68
 
69
+ # calculate the scores by comparing true and false tokens
70
  batch_scores = model(**tokens).logits[:, -1, :]
71
  true_vector = batch_scores[:, token_true_id]
72
  false_vector = batch_scores[:, token_false_id]
73
  batch_scores = torch.stack([false_vector, true_vector], dim=1)
74
  batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
75
  scores = batch_scores[:, 1].exp().tolist()
76
+ print(scores) # [0.0020704232156276703, 0.9999990463256836] the passage is not relevant to the first query, as expected
77
  ```
78
 
79
  # Citation