thankrandomness committed on
Commit
5c01f6c
·
1 Parent(s): ef72046

remove similarity_threshold

Browse files
Files changed (1) hide show
  1. app.py +13 -13
app.py CHANGED
@@ -78,7 +78,7 @@ def upsert_data(dataset_split):
78
  upsert_data(dataset['train'])
79
 
80
  # Define retrieval function with similarity threshold
81
- def retrieve_relevant_text(input_text, similarity_threshold=1.0): # Lower threshold to capture more results
82
  input_embedding = embed_text([input_text])[0]
83
  results = collection.query(
84
  query_embeddings=[input_embedding],
@@ -90,20 +90,20 @@ def retrieve_relevant_text(input_text, similarity_threshold=1.0): # Lower thres
90
  #print("Retrieved items and their similarity scores:")
91
  for metadata, distance in zip(results['metadatas'][0], results['distances'][0]):
92
  #print(f"Code: {metadata['code']}, Similarity Score: {distance}")
93
- if distance <= similarity_threshold:
94
- output.append({
95
- "similarity_score": distance,
96
- "code": metadata['code'],
97
- "code_system": metadata['code_system'],
98
- "description": metadata['description']
99
- })
100
 
101
- if not output:
102
- print("No results met the similarity threshold.")
103
  return output
104
 
105
  # Evaluate retrieval efficiency on the validation/test set
106
- def evaluate_efficiency(dataset_split, similarity_threshold=1.0):
107
  y_true = []
108
  y_pred = []
109
  total_similarity = 0
@@ -115,7 +115,7 @@ def evaluate_efficiency(dataset_split, similarity_threshold=1.0):
115
  annotations_list = [annotation['code'] for annotation in note.get('annotations', []) if 'code' in annotation]
116
 
117
  if text and annotations_list:
118
- retrieved_results = retrieve_relevant_text(text, similarity_threshold=similarity_threshold)
119
  retrieved_codes = [result['code'] for result in retrieved_results]
120
 
121
  # Sum up similarity scores for average calculation
@@ -153,7 +153,7 @@ def evaluate_efficiency(dataset_split, similarity_threshold=1.0):
153
  return precision, recall, f1, avg_similarity
154
 
155
  # Calculate retrieval efficiency metrics
156
- precision, recall, f1, avg_similarity = evaluate_efficiency(dataset['validation'], similarity_threshold=1.0)
157
 
158
  # Gradio interface
159
  def gradio_interface(input_text):
 
78
  upsert_data(dataset['train'])
79
 
80
  # Define retrieval function with similarity threshold
81
+ def retrieve_relevant_text(input_text):
82
  input_embedding = embed_text([input_text])[0]
83
  results = collection.query(
84
  query_embeddings=[input_embedding],
 
90
  #print("Retrieved items and their similarity scores:")
91
  for metadata, distance in zip(results['metadatas'][0], results['distances'][0]):
92
  #print(f"Code: {metadata['code']}, Similarity Score: {distance}")
93
+ #if distance <= similarity_threshold:
94
+ output.append({
95
+ "similarity_score": distance,
96
+ "code": metadata['code'],
97
+ "code_system": metadata['code_system'],
98
+ "description": metadata['description']
99
+ })
100
 
101
+ # if not output:
102
+ # print("No results met the similarity threshold.")
103
  return output
104
 
105
  # Evaluate retrieval efficiency on the validation/test set
106
+ def evaluate_efficiency(dataset_split):
107
  y_true = []
108
  y_pred = []
109
  total_similarity = 0
 
115
  annotations_list = [annotation['code'] for annotation in note.get('annotations', []) if 'code' in annotation]
116
 
117
  if text and annotations_list:
118
+ retrieved_results = retrieve_relevant_text(text)
119
  retrieved_codes = [result['code'] for result in retrieved_results]
120
 
121
  # Sum up similarity scores for average calculation
 
153
  return precision, recall, f1, avg_similarity
154
 
155
  # Calculate retrieval efficiency metrics
156
+ precision, recall, f1, avg_similarity = evaluate_efficiency(dataset['validation'])
157
 
158
  # Gradio interface
159
  def gradio_interface(input_text):