Tools / database /data /script.py
ZackBradshaw's picture
Upload folder using huggingface_hub
e67043b verified
import json
import json
from difflib import ndiff
import psycopg2
import time
"""
prepare the test samples
"""
def execute_sql(sql):
conn = psycopg2.connect(
database="tpch10x", user="xxx", password="xxx", host="xxx", port=xxx
)
cur = conn.cursor()
cur.execute(sql)
# res = cur.fetchall()[0][0][0]
res = cur.fetchall()
conn.commit()
cur.close()
conn.close()
return len(res)
# Load the JSON file as a dictionary
data = {}
with open("text2res_single_table.json", "r") as f:
data = json.load(f)
# Select only the diverse SQL statements
# Find SQL statements with an edit distance of less than 10
selected_sql = []
for sql1 in data:
if "sql" in sql1:
sql1 = sql1["sql"]
print("==========sql", sql1)
start_time = time.time()
res_cnt = execute_sql(sql1)
end_time = time.time()
elapsed_time = end_time - start_time
print(res_cnt, elapsed_time)
selected_sql.append(
{f"sql": sql1, "res_cnt": res_cnt, "execution_time": elapsed_time}
)
# Write the dictionary to a JSON file
with open("text2res_single_table2.json", "w") as f:
json.dump(selected_sql, f)
"""
add text descriptions for queries
"""
if __name__ == "__main__":
llm = LLM() # add the def of your llm
with open("./tpch10x/text2res_single_table2.json", "r") as json_file:
json_data = json.load(json_file)
new_json_data = []
for i, item in enumerate(json_data):
sql = item["sql"]
print("========= ", i, sql)
prompt = (
"Please convert the following sql query into one natural language sentence: \n"
+ sql
+ "\n Note. 1) Do not mention any other information other than the natural language sentence; 2) Must use the origin table and column names in the sql query."
)
text = llm(prompt)
item["text"] = text
new_json_data.append(item)
# print(llm("Describe Shanghai in 200 words."))
with open("text2res_single_table3.json", "w") as f:
json.dump(new_json_data, f)
"""
calculate total execution time
"""
with open("text2res_origin.json", "r") as json_file:
json_data = json.load(json_file)
total_time = 0
for i, item in enumerate(json_data):
print(item["execution_time"])
total_time = total_time + float(item["execution_time"])
print(total_time)