Spaces:
Runtime error
Runtime error
File size: 6,976 Bytes
9b3b1bc 8de44db 2f1b978 54919c4 b7c28ad 9b3b1bc b7c28ad 54919c4 9b3b1bc b7c28ad 54919c4 b7c28ad 8de44db b7c28ad 8de44db 9b3b1bc 54919c4 64a3b45 b7c28ad 54919c4 b7c28ad 64a3b45 b7c28ad 9b3b1bc b7c28ad 793cdd0 b7c28ad 9b3b1bc 793cdd0 9b3b1bc 858ed02 9b3b1bc 793cdd0 2b80df9 9b3b1bc 2f1b978 7c993c2 64a3b45 7c993c2 85d30b1 7c993c2 85d30b1 7c993c2 85d30b1 64a3b45 85d30b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
from docx import Document
import pandas as pd
import numpy as np
from numpy.linalg import norm
import ssl
import plotly_express as px
from scrape_onet import get_onet_code
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from sentence_transformers import SentenceTransformer
# SSL CERTIFICATE FIX
# Some environments cannot verify HTTPS certificates when model weights are
# downloaded at startup; if CPython exposes the private unverified-context
# hook, install it as the default so those downloads succeed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Build without the private hook: keep default certificate verification.
    pass
else:
    # NOTE(review): disables HTTPS certificate verification process-wide.
    ssl._create_default_https_context = _create_unverified_https_context
# LOAD DATA AND EMBEDDINGS:
# Precomputed sentence-T5 embeddings for O*NET job titles (column 0 = title,
# remaining columns = embedding vector) and their 2-D t-SNE projection used
# by the job-neighborhoods map.
simdat = pd.read_csv('static/embeddings/onet_embeddings_st5.csv')
tsne_dat = pd.read_csv('static/st5_tSNE_dat.csv')
# Parses a comma-separated LLM completion (e.g. "python, sklearn") into a list.
parser = CommaSeparatedListOutputParser()
# LOAD MODELS:
model = Ollama(model="mistral")  # local Mistral via Ollama for skill extraction
embedding_model = SentenceTransformer('sentence-transformers/sentence-t5-base', device='cpu')
# UTILITY FUNCTIONS:
def remove_new_line(value):
    """Return *value* collapsed onto one line, with all line breaks removed."""
    lines = value.splitlines()
    return ''.join(lines)
async def neighborhoods(jobtitle=None):
    """Render the t-SNE "job neighborhoods" scatter map to an HTML file.

    Plots the precomputed t-SNE coordinates (module-level ``tsne_dat``)
    colored by job category and writes the interactive figure to
    ``templates/job_neighborhoods.html``. ``jobtitle`` is accepted for the
    route signature but currently unused.
    """
    def format_title(logo, title, subtitle, title_font_size=28, subtitle_font_size=14):
        # Compose the plotly title: clickable logo, sized title, smaller subtitle.
        home_link = f'<a href="/" target="_self">{logo}</a>'
        sized_title = f'<span style="font-size: {title_font_size}px;">{title}</span>'
        sized_subtitle = f'<span style="font-size: {subtitle_font_size}px;">{subtitle}</span>'
        return f'{home_link}{sized_title}<br>{sized_subtitle}'
    fig = px.scatter(
        tsne_dat,
        x='longitude',
        y='latitude',
        color='Category',
        hover_data=['Category', 'Title'],
        title=format_title("Pathfinder", " Job Neighborhoods: Explore the Map!", ""),
    )
    fig['layout'].update(height=1000, width=1500, font=dict(family='Courier New, monospace', color='black'))
    fig.write_html('templates/job_neighborhoods.html')
def get_resume(resume):
    """Persist an uploaded .docx resume and return its text content.

    ``resume`` is an upload object exposing ``filename`` and a readable
    ``file`` handle (e.g. a FastAPI UploadFile — TODO confirm). The bytes are
    copied under ``static/`` and the document's paragraphs are joined with
    newlines.
    """
    path = f"static/{resume.filename}"
    with open(path, 'wb') as destination:
        destination.write(resume.file.read())
    document = Document(path)
    paragraphs = [para.text for para in document.paragraphs]
    return "\n".join(paragraphs)
def skill_extractor(resume):
    """Extract skills from resume text using the local Mistral LLM.

    The system prompt forces the model to answer with a bare comma-separated
    list; the completion is flattened to one line and parsed into a Python
    list of skill strings.

    Args:
        resume: Plain-text resume content.

    Returns:
        list: Skill strings parsed from the model's comma-separated answer.
    """
    system_prompt_template = SystemMessagePromptTemplate.from_template("""
### [INST]
Instruction: You are an expert job analyst tasked with identifying both technical and soft skills in resumes.
You always respond in the following format: 'skill1, skill2, skill3, ...' and never provide an explanation or justification for your response.
For example, given the following statement in a resume: 'significant experience in python and familiarity with machine learning packages, such as sklearn, torch, and tensorflow'
you respond: 'python, sklearn, torch, tensorflow'.
[/INST]
""")
    human_prompt_template = HumanMessagePromptTemplate.from_template("""
### QUESTION:
What skills are in the following resume?:
{resume}
""")
    # Chat prompt = system instructions + the resume-bearing question.
    prompt = ChatPromptTemplate.from_messages([system_prompt_template, human_prompt_template])
    llm_chain = LLMChain(llm=model, prompt=prompt)
    result = llm_chain.invoke({"resume": resume})
    # Flatten the raw completion to a single line, then split on commas.
    result = remove_new_line(result['text'])
    return parser.parse(result)
def skillEmbed(skills):
    """Encode skill text into a sentence-T5 embedding via the module model."""
    return embedding_model.encode(skills)
async def sim_result_loop(skilltext):
    """Rank O*NET job titles by cosine similarity to the user's skills.

    Args:
        skilltext: Skills as a plain string, a list of skill strings, or a
            dict mapping token -> label where entries labeled "Skill" are kept.

    Returns:
        tuple: ``(simResults, embeds)`` where ``simResults`` is a DataFrame
        with columns ``['JobTitle', 'Similarity']`` holding the 12 most
        similar titles (similarity rendered as 2-decimal strings) and
        ``embeds`` is the raw skill embedding vector.

    Raises:
        TypeError: If ``skilltext`` is not a str, dict, or list (previously
            this fell through and crashed with a NameError).
    """
    if isinstance(skilltext, str):
        skills = skilltext
    elif isinstance(skilltext, dict):
        # Keep only tokens tagged "Skill", rendered as a plain word list.
        tagged = [key for key, value in skilltext.items() if value == "Skill"]
        skills = str(tagged).replace("'", "").replace(",", "")
    elif isinstance(skilltext, list):
        skills = ', '.join(skilltext)
    else:
        raise TypeError(f"skilltext must be str, dict, or list, got {type(skilltext).__name__}")
    embeds = skillEmbed(skills)

    def cosine(A, B):
        # Cosine similarity between two 1-D vectors.
        return np.dot(A, B) / (norm(A) * norm(B))

    query = np.array(embeds)
    # Column 0 of simdat is the job title; the remaining columns are its embedding.
    simResults = pd.DataFrame(
        [cosine(query, np.array(simdat.iloc[i, 1:])) for i in range(len(simdat))]
    )
    simResults['JobTitle'] = simdat['Title']
    simResults = simResults.iloc[:, [1, 0]]
    simResults.columns = ['JobTitle', 'Similarity']
    simResults = simResults.sort_values(by="Similarity", ascending=False)
    # Keep ranks 1..12: the top hit is dropped (presumably the query's own
    # title — TODO confirm that assumption holds for free-text skill input).
    simResults = simResults.iloc[1:13, :]
    simResults.reset_index(drop=True, inplace=True)
    # Shift displayed scores into [0.5, 1.0] for presentation.
    if simResults['Similarity'].min() < 0.5:
        simResults['Similarity'] = simResults['Similarity'] + (0.5 - simResults['Similarity'].min())
    if simResults['Similarity'].max() > 1.0:
        simResults['Similarity'] = simResults['Similarity'] - (simResults['Similarity'].max() - 1.0)
    # Render scores as fixed 2-decimal strings in one vectorized pass.
    simResults['Similarity'] = simResults['Similarity'].map("{:0.2f}".format)
    return simResults, embeds
def get_links(simResults):
    """Build an O*NET summary URL for each recommended job title.

    Args:
        simResults: DataFrame with a ``'JobTitle'`` column.

    Returns:
        list[str]: One onetonline.org summary link per title, in order.
    """
    base = "https://www.onetonline.org/link/summary/"
    # Plain list comprehension instead of the original side-effect comprehension.
    return [base + get_onet_code(title) for title in simResults["JobTitle"]]
def sim_result_loop_jobFinder(skills):
    """Rank posted jobs by cosine similarity to a candidate's skills.

    Args:
        skills: Skill text to embed and compare against the job-description
            embeddings stored in ``static/jd_embeddings.csv``.

    Returns:
        pandas.DataFrame: Columns ``['job_id', 'employer_email',
        'similarity']`` sorted by similarity descending; similarity is
        rendered as 2-decimal strings.
    """
    embeds = skillEmbed(skills)

    def cosine(A, B):
        # Cosine similarity between two 1-D vectors.
        return np.dot(A, B) / (norm(A) * norm(B))

    jobdat = pd.read_csv('static/jd_embeddings.csv')
    # Embedding vectors start at column 5.
    # NOTE(review): dropna() can remove rows, but 'id'/'email' are later
    # attached by position from the full jobdat — if any row is actually
    # dropped the scores and ids misalign; verify the CSV has no NaNs.
    jobembeds = jobdat.iloc[:, 5:].dropna()
    query = np.array(embeds)
    simResults = pd.DataFrame(
        [cosine(query, np.array(jobembeds.iloc[i, :])) for i in range(len(jobembeds))]
    )
    simResults['job_id'] = jobdat['id']
    simResults['emp_email'] = jobdat['email']
    simResults = simResults.iloc[:, [1, 2, 0]]
    simResults.columns = ['job_id', 'employer_email', 'similarity']
    simResults = simResults.sort_values(by="similarity", ascending=False)
    simResults.reset_index(drop=True, inplace=True)
    # Render scores as fixed 2-decimal strings in one vectorized pass.
    simResults['similarity'] = simResults['similarity'].map("{:0.2f}".format)
    return simResults
def sim_result_loop_candFinder(skills):
    """Rank candidates by cosine similarity to a job posting's skills.

    Args:
        skills: Skill text to embed and compare against the resume
            embeddings stored in ``static/res_embeddings.csv``.

    Returns:
        pandas.DataFrame: Columns ``['candidate_id', 'candidate_email',
        'similarity']`` sorted by similarity descending; similarity is
        rendered as 2-decimal strings.
    """
    embeds = skillEmbed(skills)

    def cosine(A, B):
        # Cosine similarity between two 1-D vectors.
        return np.dot(A, B) / (norm(A) * norm(B))

    canddat = pd.read_csv('static/res_embeddings.csv')
    # Embedding vectors start at column 5.
    # NOTE(review): dropna() can remove rows, but 'id'/'email' are later
    # attached by position from the full canddat — if any row is actually
    # dropped the scores and ids misalign; verify the CSV has no NaNs.
    candembeds = canddat.iloc[:, 5:].dropna()
    query = np.array(embeds)
    simResults = pd.DataFrame(
        [cosine(query, np.array(candembeds.iloc[i, :])) for i in range(len(candembeds))]
    )
    simResults['cand_id'] = canddat['id']
    simResults['cand_email'] = canddat['email']
    simResults = simResults.iloc[:, [1, 2, 0]]
    simResults.columns = ['candidate_id', 'candidate_email', 'similarity']
    simResults = simResults.sort_values(by="similarity", ascending=False)
    simResults.reset_index(drop=True, inplace=True)
    # Render scores as fixed 2-decimal strings in one vectorized pass.
    simResults['similarity'] = simResults['similarity'].map("{:0.2f}".format)
    return simResults