from docx import Document
import pandas as pd
import numpy as np
from numpy.linalg import norm
import ssl
import plotly.express as px
from scrape_onet import get_onet_code
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from sentence_transformers import SentenceTransformer

# SSL CERTIFICATE FIX
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# LOAD DATA AND EMBEDDINGS:
simdat = pd.read_csv('static/embeddings/onet_embeddings_st5.csv')
tsne_dat = pd.read_csv('static/st5_tSNE_dat.csv')
parser = CommaSeparatedListOutputParser()

# LOAD MODELS:
model = Ollama(model="mistral")
embedding_model = SentenceTransformer('sentence-transformers/sentence-t5-base', device='cpu')

# UTILITY FUNCTIONS:
def remove_new_line(value):
    return ''.join(value.splitlines())

async def neighborhoods(jobtitle=None):
    """Render the t-SNE job map as an interactive scatter plot and save it to templates/job_neighborhoods.html."""
    def format_title(logo, title, subtitle, title_font_size=28, subtitle_font_size=14):
        logo = f'<a href="/" target="_self">{logo}</a>'
        subtitle = f'<span style="font-size: {subtitle_font_size}px;">{subtitle}</span>'
        title = f'<span style="font-size: {title_font_size}px;">{title}</span>'
        return f'{logo}{title}<br>{subtitle}'
    fig = px.scatter(tsne_dat, x='longitude', y='latitude', color='Category', hover_data=['Category', 'Title'],
                     title=format_title("Pathfinder", "     Job Neighborhoods: Explore the Map!", ""))
    fig['layout'].update(height=1000, width=1500, font=dict(family='Courier New, monospace', color='black'))
    fig.write_html('templates/job_neighborhoods.html')

def get_resume(resume):
    """Save an uploaded .docx resume to static/ and return its text, one paragraph per line."""
    path = f"static/{resume.filename}"
    with open(path, 'wb') as buffer:
        buffer.write(resume.file.read())
    doc = Document(path)
    return "\n".join(para.text for para in doc.paragraphs)

def skill_extractor(resume):
    """Ask the LLM to pull skills out of resume text and return them as a list of strings."""
    system_prompt_template = SystemMessagePromptTemplate.from_template("""
    ### [INST]
    Instruction: You are an expert job analyst tasked with identifying both technical and soft skills in resumes.
    You always respond in the following format: 'skill1, skill2, skill3, ...' and never provide an explanation or justification for your response.
    For example, given the following statement in a resume: 'significant experience in python and familiarity with machine learning packages, such as sklearn, torch, and tensorflow'
    you respond: 'python, sklearn, torch, tensorflow'.
    [/INST]
    """)

    human_prompt_template = HumanMessagePromptTemplate.from_template("""
    ### QUESTION:
    What skills are in the following resume?:
    {resume}
    """)

    prompt = ChatPromptTemplate.from_messages([system_prompt_template, human_prompt_template])
    llm_chain = LLMChain(llm=model, prompt=prompt)

    result = llm_chain.invoke({"resume": resume})
    result = remove_new_line(result['text'])
    return parser.parse(result)

def skillEmbed(skills):
    embeddings = embedding_model.encode(skills)
    return embeddings

async def sim_result_loop(skilltext):
    """Embed the extracted skills and return the closest O*NET job titles along with the skill embedding."""
    # Normalize the three accepted input shapes into a single skill string.
    if isinstance(skilltext, str):
        skills = skilltext
    elif isinstance(skilltext, dict):
        skills = [key for key, value in skilltext.items() if value == "Skill"]
        skills = str(skills).replace("'", "").replace(",", "")
    elif isinstance(skilltext, list):
        skills = ', '.join(skilltext)
    else:
        raise TypeError("skilltext must be a str, dict, or list")
    embeds = skillEmbed(skills)

    def cosine(A, B):
        return np.dot(A, B) / (norm(A) * norm(B))

    def format_sim(sim):
        return "{:0.2f}".format(sim)

    # Cosine similarity between the skill embedding and every O*NET title embedding.
    simResults = [cosine(np.array(embeds), np.array(simdat.iloc[i, 1:])) for i in range(len(simdat))]
    simResults = pd.DataFrame(simResults)
    simResults['JobTitle'] = simdat['Title']
    simResults = simResults.iloc[:, [1, 0]]
    simResults.columns = ['JobTitle', 'Similarity']
    simResults = simResults.sort_values(by="Similarity", ascending=False)
    # Keep the next 12 matches after dropping the single closest title.
    simResults = simResults.iloc[1:13, :]
    simResults.reset_index(drop=True, inplace=True)
    # Rescale the displayed scores so they sit between 0.5 and 1.0.
    if simResults['Similarity'].min() < 0.5:
        simResults['Similarity'] = simResults['Similarity'] + (0.5 - simResults['Similarity'].min())
        if simResults['Similarity'].max() > 1.0:
            simResults['Similarity'] = simResults['Similarity'] - (simResults['Similarity'].max() - 1.0)
    # Format scores as two-decimal strings for display.
    for x in range(len(simResults)):
        simResults.iloc[x, 1] = format_sim(simResults.iloc[x, 1])
    return simResults, embeds

def get_links(simResults):
    """Build an O*NET summary URL for each recommended job title."""
    titles = simResults["JobTitle"]
    return ["https://www.onetonline.org/link/summary/" + get_onet_code(title) for title in titles]

def sim_result_loop_jobFinder(skills):
    """Rank posted job descriptions by cosine similarity to a candidate's skills."""
    embeds = skillEmbed(skills)

    def cosine(A, B):
        return np.dot(A, B) / (norm(A) * norm(B))

    def format_sim(sim):
        return "{:0.2f}".format(sim)

    jobdat = pd.read_csv('static/jd_embeddings.csv')
    jobembeds = jobdat.iloc[:, 5:].dropna()  # embedding columns start at column 5
    simResults = [cosine(np.array(embeds), np.array(jobembeds.iloc[i, :])) for i in range(len(jobembeds))]
    simResults = pd.DataFrame(simResults)
    simResults['job_id'] = jobdat['id']
    simResults['emp_email'] = jobdat['email']
    simResults = simResults.iloc[:, [1, 2, 0]]
    simResults.columns = ['job_id', 'employer_email', 'similarity']
    simResults = simResults.sort_values(by="similarity", ascending=False)
    simResults.reset_index(drop=True, inplace=True)
    for x in range(len(simResults)):
        simResults.iloc[x, 2] = format_sim(simResults.iloc[x, 2])
    return simResults

def sim_result_loop_candFinder(skills):
    """Rank stored candidate resumes by cosine similarity to a job posting's required skills."""
    embeds = skillEmbed(skills)

    def cosine(A, B):
        return np.dot(A, B) / (norm(A) * norm(B))

    def format_sim(sim):
        return "{:0.2f}".format(sim)

    canddat = pd.read_csv('static/res_embeddings.csv')
    candembeds = canddat.iloc[:, 5:].dropna()  # embedding columns start at column 5
    simResults = [cosine(np.array(embeds), np.array(candembeds.iloc[i, :])) for i in range(len(candembeds))]
    simResults = pd.DataFrame(simResults)
    simResults['cand_id'] = canddat['id']
    simResults['cand_email'] = canddat['email']
    simResults = simResults.iloc[:, [1, 2, 0]]
    simResults.columns = ['candidate_id', 'candidate_email', 'similarity']
    simResults = simResults.sort_values(by="similarity", ascending=False)
    simResults.reset_index(drop=True, inplace=True)
    for x in range(len(simResults)):
        simResults.iloc[x, 2] = format_sim(simResults.iloc[x, 2])
    return simResults
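
# The sketch below is a hypothetical usage example, not part of the original module: it assumes the
# CSVs loaded above exist and skips the resume-upload and LLM steps by feeding a hand-written skill
# list straight into the matching loop. Note that get_links performs live O*NET lookups.
if __name__ == "__main__":
    import asyncio

    example_skills = ["python", "sql", "data visualization", "stakeholder communication"]  # hypothetical input
    matches, _embedding = asyncio.run(sim_result_loop(example_skills))
    print(matches)             # top O*NET job titles with similarity scores
    print(get_links(matches))  # O*NET summary pages for those titles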