# aibsimilarityllm / main.py
import os
import threading
import time

import requests
from flask import Flask, render_template
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_community.tools import DuckDuckGoSearchResults
API_URL0 = "https://api-inference.huggingface.co/models/sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
API_URL1 = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2"
API_URL2 = "https://api-inference.huggingface.co/models/sentence-transformers/all-roberta-large-v1"
API_URL3 = "https://api-inference.huggingface.co/models/Snowflake/snowflake-arctic-embed-l-v2.0"
# API_URL4 = "https://api-inference.huggingface.co/models/sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
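# Each endpoint hosts a sentence-similarity model behind the Hugging Face
# Inference API: POSTing {"inputs": {"source_sentence": ..., "sentences": [...]}}
# returns one cosine-similarity score per candidate sentence, e.g.
# [0.97, 0.95, 0.41, ...].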
search = GoogleSearchAPIWrapper()
bearer = "Bearer " + os.getenv('TOKEN')
headers = {"Authorization": bearer}
print("headers")
print(headers)  # debug only: note this logs the bearer token
app = Flask(__name__)
@app.route('/app')
def server_app():
llamafile = threading.Thread(target=threadserver)
    print('This /app will start the llamafile server on a thread')
llamafile.start()
return 'llamafile.start()'
@app.route('/findsimilarity')
def server_one():
sourcesim = "Results"
s1 = "Results"
    return render_template("similarity_1.html", sourcetxt=sourcesim, s1=s1, headertxt=bearer)
@app.route('/')
def server_1():
# TODO :: check html first then check similarity
# TODO :: check parts of snipp to pass in the processing func
query_sentence = "capital city of the Philippines"
duck_results = []
all_results = []
    try:
        searchduck = DuckDuckGoSearchResults(output_format="list", num_results=20)
        duck_results = searchduck.invoke(query_sentence)
        print("type of duck")
        print(type(duck_results))
    except Exception as e:
        print("DuckDuckGo search failed:", e)
        duck_results = []
    if isinstance(duck_results, list) and len(duck_results) > 0:
        all_results = duck_results
tool = Tool(
name="google_search",
description="Search Google for recent results.",
func=search.run,
)
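    # Note: this Tool wrapper is defined but never invoked below; the raw
    # GoogleSearchAPIWrapper is queried directly via search.results().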
    google_results = []
    try:
        google_results = search.results(query_sentence, 10)
    except Exception as e:
        print("Google search failed:", e)
    if isinstance(google_results, list) and len(google_results) > 0:
        all_results = all_results + google_results
    print("len of google and duck")
    print(len(all_results))
    print(len(google_results))
    print(len(duck_results))
# print(all_results)
    all_snipps = []
    new_results = []
    # Split the query into plain keywords (longer than 3 chars, lowercase)
    # and capitalized keywords (likely proper nouns).
    split_query_words = query_sentence.split()
    important_keywords = []
    uppercased_keywords = []
    for x in split_query_words:
        if x[0].isupper():
            uppercased_keywords.append(x)
        if len(x) > 3 and not x[0].isupper():
            important_keywords.append(x)
    print("what is important and upper")
    print(important_keywords)
    print(uppercased_keywords)
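    # Filter pass: score each result's snippet by how many plain keywords
    # (snipp_score) and capitalized keywords (capitalized_score) it contains,
    # then keep the results that clear the thresholds below.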
    snipp_score = 0
    capitalized_score = 0
    for x in all_results:
        snipp_score = 0
        capitalized_score = 0
        for words in important_keywords:
            if x["snippet"].find(words) != -1:
                snipp_score = snipp_score + 1
        for words in uppercased_keywords:
            if x["snippet"].find(words) != -1:
                snipp_score = snipp_score + 1
                capitalized_score = capitalized_score + 1
        if snipp_score >= len(important_keywords) and ((0 < capitalized_score <= len(uppercased_keywords)) or len(uppercased_keywords) == 0):
            new_results.append(x)
            continue
        if (2 <= snipp_score <= len(important_keywords) and len(important_keywords) <= 4) and ((1 <= capitalized_score <= len(uppercased_keywords)) or len(uppercased_keywords) == 0):
            new_results.append(x)
            continue
        if (4 <= snipp_score <= len(important_keywords) and 5 <= len(important_keywords) <= 7) and ((0 < capitalized_score <= len(uppercased_keywords)) or len(uppercased_keywords) == 0):
            new_results.append(x)
            continue
        else:
            # skip the result
            print("This is not added")
    print("len(new_results)")
    print(len(new_results))
# TODO :: check html first then check similarity
# TODO :: check parts of snipp to pass in the processing func
# TODO :: pull pages and split each html and count occurance of important keywords here & check snipp if snipp occurs between . and <p> its good not img
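    # Second pass: split each kept snippet into rough sentences on '.' and
    # keep only the sentences that match most of the keywords.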
    n_results = {}
    iter_x = 0
    for x in new_results:
        n_results[iter_x] = []
        for y in x["snippet"].split('.'):
            score = 0
            cap_score = 0
            for words in important_keywords:
                if y.find(words) != -1:
                    score = score + 1
            for words in uppercased_keywords:
                if y.find(words) != -1:
                    cap_score = cap_score + 1
            if score == len(important_keywords) and cap_score >= len(uppercased_keywords):
                n_results[iter_x].append(y)
            # elif avoids appending the same sentence twice when both
            # conditions hold (the first condition implies the second).
            elif score >= (len(important_keywords) - 1) or (cap_score >= len(uppercased_keywords) and len(uppercased_keywords) > 0):
                n_results[iter_x].append(y)
        iter_x = iter_x + 1
    print("n_results length")
    print(len(n_results))
    # Flatten the per-result sentence lists into one comparison list.
    sentences_comparison = []
    for key in n_results:
        for x in n_results[key]:
            sentences_comparison.append(x)
    print("sentences_comparison")
    print(sentences_comparison)
    # TODO :: check parts of snipp
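    # The payload below follows the Hugging Face sentence-similarity input
    # format. Note that the candidate sentences are currently hardcoded
    # samples; sentences_comparison built above is not yet wired in.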
    payload = {
        "inputs": {
            "source_sentence": "Manila is the capital city of the Philippines",
            "sentences": [
                "The current capital city, Manila, has been the countrys capital throughout most",
                "Manila officially the City of Manila (Filipino: Lungsod ng Maynila),",
                "Dis 4, 2024 — Manila, capital and chief city of the Philippines. The city is the centre ",
                "Quezon City is the capital of the Philippines",
                "Manila is the capital of the philippines",
                "For sometime Manila has been the capital of of the Philippines",
                "What is the capital of Philippines",
                "Manila is not the capital of the Phillipines",
                "Quezon city was the capital of the Philippines, until President Ferdinand ",
            ],
        },
    }
    response0 = requests.post(API_URL0, headers=headers, json=payload)
    response1 = requests.post(API_URL1, headers=headers, json=payload)
    response2 = requests.post(API_URL2, headers=headers, json=payload)
    response3 = requests.post(API_URL3, headers=headers, json=payload)
    # Parse each response once; a list means the model returned scores.
    resp0 = response0.json()
    resp1 = response1.json()
    resp2 = response2.json()
    resp3 = response3.json()
    varcontinue_similarity = 0
    print("response types:", type(resp0), type(resp1), type(resp2), type(resp3))
    if isinstance(resp0, list) and isinstance(resp1, list) and isinstance(resp2, list) and isinstance(resp3, list):
        similarity_scores = resp0 + resp1 + resp2 + resp3
        # Sort each model's scores descending so the strongest matches come first.
        sorted0 = sorted(resp0, reverse=True)
        sorted1 = sorted(resp1, reverse=True)
        sorted2 = sorted(resp2, reverse=True)
        sorted3 = sorted(resp3, reverse=True)
        varcontinue_similarity = 1
    else:
        similarity_scores = "There's an error in llm similarity search retrieval"
        return similarity_scores
time.sleep(2)
    result_processed = ""
    # If every response was a list, run the consensus processing.
    if varcontinue_similarity == 1:
        # Both original branches (exactly 10 results, more than 10) called the
        # same processor, so they are merged here.
        if len(all_results) >= 10:
            result_processed = process_similarity_15(sorted0, sorted1, sorted2, sorted3, resp0, resp1, resp2, resp3)
    # return all_results
    return result_processed
def threadserver():
print('hi')
    os.system('./mxbai-embed-large-v1-f16.llamafile --server --nobrowser')
def process_similarity_15(sorted0, sorted1, sorted2, sorted3, actualscore0, actualscore1, actualscore2, actualscore3):
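    """Map each model's high scores (> 0.90) back to sentence indices, then
    keep the indices that several models agree on ("webgraph" consensus)."""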
    print("actual scores")
    print(actualscore0)
    print(actualscore1)
    print(actualscore2)
    print(actualscore3)
    print("the sorted0-3")
    print(sorted0)
    print(sorted1)
    print(sorted2)
    print(sorted3)
    print("end the sorted0-3")
    # Map each high score (> 0.90) back to its index in the original response.
    # Note: list.index returns the first match, so duplicate scores all map to
    # the same index.
    sorted0_with_index = []
    for x in sorted0:
        for y in actualscore0:
            if x == y and y > 0.90:
                sorted0_with_index.append(actualscore0.index(y))
    print("sorted0_with_index")
    print(sorted0_with_index)
    sorted1_with_index = []
    for x in sorted1:
        for y in actualscore1:
            if x == y and y > 0.90:
                sorted1_with_index.append(actualscore1.index(y))
    print("sorted1_with_index")
    print(sorted1_with_index)
    sorted2_with_index = []
    for x in sorted2:
        for y in actualscore2:
            if x == y and y > 0.90:
                sorted2_with_index.append(actualscore2.index(y))
    print("sorted2_with_index")
    print(sorted2_with_index)
    sorted3_with_index = []
    for x in sorted3:
        for y in actualscore3:
            if x == y and y > 0.90:
                sorted3_with_index.append(actualscore3.index(y))
    print("sorted3_with_index")
    print(sorted3_with_index)
    print("sorted0-3_with_index")
    print(sorted0_with_index)
    print(sorted1_with_index)
    print(sorted2_with_index)
    print(sorted3_with_index)
    # At this point the scores are sorted and the indexes are stored per model.
    this_unique_list = set(sorted0_with_index + sorted1_with_index + sorted2_with_index + sorted3_with_index)  # currently unused
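    # Consensus vote: an index is kept when it appears in all other models'
    # high-score lists, in at least two of them, or in at least one while its
    # own score exceeds 0.96. Duplicates are removed by set() at the end.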
    webgraph_list = []
    for x in sorted0_with_index:
        if x in sorted1_with_index and x in sorted2_with_index and x in sorted3_with_index:
            webgraph_list.append(x)
        if (x in sorted1_with_index and x in sorted2_with_index) or (x in sorted2_with_index and x in sorted3_with_index) or (x in sorted1_with_index and x in sorted3_with_index):
            webgraph_list.append(x)
        # actualscore0[x] is this sentence's own score from model 0 (the
        # original indexed by loop position, which looked unintended).
        if (x in sorted1_with_index or x in sorted2_with_index or x in sorted3_with_index) and actualscore0[x] > 0.96:
            webgraph_list.append(x)
    print("webgraph_list0")
    print(webgraph_list)
    for x in sorted1_with_index:
        if x in sorted0_with_index and x in sorted2_with_index and x in sorted3_with_index:
            webgraph_list.append(x)
        if (x in sorted0_with_index and x in sorted2_with_index) or (x in sorted2_with_index and x in sorted3_with_index) or (x in sorted0_with_index and x in sorted3_with_index):
            webgraph_list.append(x)
        if (x in sorted0_with_index or x in sorted2_with_index or x in sorted3_with_index) and actualscore1[x] > 0.96:
            webgraph_list.append(x)
    print("webgraph_list1")
    print(webgraph_list)
    for x in sorted2_with_index:
        if x in sorted0_with_index and x in sorted1_with_index and x in sorted3_with_index:
            webgraph_list.append(x)
        if (x in sorted0_with_index and x in sorted1_with_index) or (x in sorted1_with_index and x in sorted3_with_index) or (x in sorted0_with_index and x in sorted3_with_index):
            webgraph_list.append(x)
        if (x in sorted0_with_index or x in sorted1_with_index or x in sorted3_with_index) and actualscore2[x] > 0.96:
            webgraph_list.append(x)
    print("webgraph_list2")
    print(webgraph_list)
    for x in sorted3_with_index:
        if x in sorted0_with_index and x in sorted1_with_index and x in sorted2_with_index:
            webgraph_list.append(x)
        if (x in sorted0_with_index and x in sorted2_with_index) or (x in sorted1_with_index and x in sorted2_with_index) or (x in sorted0_with_index and x in sorted1_with_index):
            webgraph_list.append(x)
        if (x in sorted0_with_index or x in sorted1_with_index or x in sorted2_with_index) and actualscore3[x] > 0.96:
            webgraph_list.append(x)
    print("webgraph_list3")
    print(webgraph_list)
    print("webgraph_list")
    print(webgraph_list)
    # Dedupe and return the agreed-upon sentence indices.
    return str(list(set(webgraph_list)))
if __name__ == '__main__':
app.run(host='0.0.0.0', port=8081)
# server_app()