from docx import Document
import pandas as pd
import numpy as np
from numpy.linalg import norm
import ssl
import plotly_express as px
from scrape_onet import get_onet_code
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from sentence_transformers import SentenceTransformer

# SSL CERTIFICATE FIX
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# LOAD DATA AND EMBEDDINGS:
simdat = pd.read_csv('static/embeddings/onet_embeddings_st5.csv')
tsne_dat = pd.read_csv('static/st5_tSNE_dat.csv')
parser = CommaSeparatedListOutputParser()

# LOAD MODELS:
model = Ollama(model="mistral")
embedding_model = SentenceTransformer('sentence-transformers/sentence-t5-base', device='cpu')

# UTILITY FUNCTIONS:
def remove_new_line(value):
    return ''.join(value.splitlines())
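
# Build the t-SNE "job neighborhoods" scatter plot and write it to
# templates/job_neighborhoods.html; the jobtitle argument is currently unused.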
async def neighborhoods(jobtitle=None):
    def format_title(logo, title, subtitle, title_font_size=28, subtitle_font_size=14):
        logo = f'<a href="/" target="_self">{logo}</a>'
        subtitle = f'<span style="font-size: {subtitle_font_size}px;">{subtitle}</span>'
        title = f'<span style="font-size: {title_font_size}px;">{title}</span>'
        return f'{logo}{title}<br>{subtitle}'
    fig = px.scatter(tsne_dat, x='longitude', y='latitude', color='Category', hover_data=['Category', 'Title'],
                     title=format_title("Pathfinder", " Job Neighborhoods: Explore the Map!", ""))
    fig['layout'].update(height=1000, width=1500, font=dict(family='Courier New, monospace', color='black'))
    fig.write_html('templates/job_neighborhoods.html')
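
# Save an uploaded resume under static/ and return its text. Assumes a
# FastAPI/Starlette-style upload object exposing .filename and .file, and a
# .docx file readable by python-docx.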
def get_resume(resume):
    path = f"static/{resume.filename}"
    with open(path, 'wb') as buffer:
        buffer.write(resume.file.read())
    file = Document(path)
    text = []
    for para in file.paragraphs:
        text.append(para.text)
    resume = "\n".join(text)
    return resume
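
# Ask the local Mistral model (served by Ollama) to extract technical and soft
# skills from resume text; the comma-separated response is parsed into a list.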
def skill_extractor(resume):
    system_prompt_template = SystemMessagePromptTemplate.from_template("""
        ### [INST]
        Instruction: You are an expert job analyst tasked with identifying both technical and soft skills in resumes.
        You always respond in the following format: 'skill1, skill2, skill3, ...' and never provide an explanation or justification for your response.
        For example, given the following statement in a resume: 'significant experience in python and familiarity with machine learning packages, such as sklearn, torch, and tensorflow'
        you respond: 'python, sklearn, torch, tensorflow'.
        [/INST]
        """)
    human_prompt_template = HumanMessagePromptTemplate.from_template("""
        ### QUESTION:
        What skills are in the following resume?:
        {resume}
        """)
    prompt = ChatPromptTemplate.from_messages([system_prompt_template, human_prompt_template])
    llm_chain = LLMChain(llm=model, prompt=prompt)
    result = llm_chain.invoke({"resume": resume})
    result = remove_new_line(result['text'])
    return parser.parse(result)
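
# Encode a skill string (or list of strings) with the sentence-t5 model.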
def skillEmbed(skills):
    embeddings = embedding_model.encode(skills)
    return embeddings
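
# Compare the extracted skills against the precomputed O*NET title embeddings
# and return the closest matches plus the skill embedding itself. Accepts a
# plain string, a dict of token labels (keeping entries tagged "Skill"), or a
# list of skills. The top match of the sorted results is dropped, the next 12
# are kept, and their similarities are shifted so the displayed values fall
# roughly between 0.5 and 1.0.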
async def sim_result_loop(skilltext):
    if isinstance(skilltext, str):
        skills = skilltext
    if isinstance(skilltext, dict):
        skills = [key for key, value in skilltext.items() if value == "Skill"]
        skills = str(skills).replace("'", "").replace(",", "")
    if isinstance(skilltext, list):
        skills = ', '.join(skilltext)
    embeds = skillEmbed(skills)
    def cosine(A, B):
        return np.dot(A, B) / (norm(A) * norm(B))
    def format_sim(sim):
        return "{:0.2f}".format(sim)
    simResults = []
    for i in range(len(simdat)):
        simResults.append(cosine(np.array(embeds), np.array(simdat.iloc[i, 1:])))
    simResults = pd.DataFrame(simResults)
    simResults['JobTitle'] = simdat['Title']
    simResults = simResults.iloc[:, [1, 0]]
    simResults.columns = ['JobTitle', 'Similarity']
    simResults = simResults.sort_values(by="Similarity", ascending=False)
    simResults = simResults.iloc[:13, :]
    simResults = simResults.iloc[1:, :]
    simResults.reset_index(drop=True, inplace=True)
    if simResults['Similarity'].min() < 0.5:
        simResults['Similarity'] = simResults['Similarity'] + (0.5 - simResults['Similarity'].min())
    if simResults['Similarity'].max() > 1.0:
        simResults['Similarity'] = simResults['Similarity'] - (simResults['Similarity'].max() - 1.0)
    for x in range(len(simResults)):
        simResults.iloc[x, 1] = format_sim(simResults.iloc[x, 1])
    return simResults, embeds
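
# Build O*NET summary links for the matched job titles by scraping each
# title's O*NET-SOC code with scrape_onet.get_onet_code.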
def get_links(simResults):
    links = []
    titles = simResults["JobTitle"]
    for title in titles:
        links.append("https://www.onetonline.org/link/summary/" + get_onet_code(title))
    return links
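
# Rank posted jobs for a candidate: compare the candidate's skill embedding
# against the job-description embeddings in static/jd_embeddings.csv
# (embedding columns are assumed to start at column index 5).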
def sim_result_loop_jobFinder(skills):
    embeds = skillEmbed(skills)
    def cosine(A, B):
        return np.dot(A, B) / (norm(A) * norm(B))
    def format_sim(sim):
        return "{:0.2f}".format(sim)
    jobdat = pd.read_csv('static/jd_embeddings.csv')
    jobembeds = jobdat.iloc[:, 5:].dropna()
    simResults = []
    for i in range(len(jobembeds)):
        simResults.append(cosine(np.array(embeds), np.array(jobembeds.iloc[i, :])))
    simResults = pd.DataFrame(simResults)
    simResults['job_id'] = jobdat['id']
    simResults['emp_email'] = jobdat['email']
    simResults = simResults.iloc[:, [1, 2, 0]]
    simResults.columns = ['job_id', 'employer_email', 'similarity']
    simResults = simResults.sort_values(by="similarity", ascending=False)
    simResults.reset_index(drop=True, inplace=True)
    for x in range(len(simResults)):
        simResults.iloc[x, 2] = format_sim(simResults.iloc[x, 2])
    return simResults
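
# Rank candidates for a job posting: same procedure as the job finder, but
# against the resume embeddings in static/res_embeddings.csv.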
def sim_result_loop_candFinder(skills):
    embeds = skillEmbed(skills)
    def cosine(A, B):
        return np.dot(A, B) / (norm(A) * norm(B))
    def format_sim(sim):
        return "{:0.2f}".format(sim)
    canddat = pd.read_csv('static/res_embeddings.csv')
    candembeds = canddat.iloc[:, 5:].dropna()
    simResults = []
    for i in range(len(candembeds)):
        simResults.append(cosine(np.array(embeds), np.array(candembeds.iloc[i, :])))
    simResults = pd.DataFrame(simResults)
    simResults['cand_id'] = canddat['id']
    simResults['cand_email'] = canddat['email']
    simResults = simResults.iloc[:, [1, 2, 0]]
    simResults.columns = ['candidate_id', 'candidate_email', 'similarity']
    simResults = simResults.sort_values(by="similarity", ascending=False)
    simResults.reset_index(drop=True, inplace=True)
    for x in range(len(simResults)):
        simResults.iloc[x, 2] = format_sim(simResults.iloc[x, 2])
    return simResults
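
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the app's request flow). Assumes the
# CSV assets under static/ exist and the sentence-t5 model can be loaded; the
# example skill string is made up.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    example_skills = "python, pandas, machine learning, data visualization"
    matches, _ = asyncio.run(sim_result_loop(example_skills))
    print(matches)
    print(get_links(matches))  # needs network access to resolve O*NET codes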