"""Resume skill extraction and embedding-based job/candidate matching.

Importing this module loads the O*NET embedding tables from ``static/``,
creates the Ollama LLM client and loads the sentence-embedding model.
"""

import os
import ssl

import numpy as np
import pandas as pd
import plotly_express as px
from docx import Document
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms.ollama import Ollama
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

from scrape_onet import get_onet_code

# SSL CERTIFICATE FIX
# SECURITY NOTE(review): this replaces the default HTTPS context factory for
# the whole process, so certificate verification is disabled for every
# connection that relies on ssl's default context. Behaviour is kept as-is
# here; confirm it is still required before shipping.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# LOAD DATA AND EMBEDDINGS:
simdat = pd.read_csv('static/embeddings/onet_embeddings_st5.csv')
tsne_dat = pd.read_csv('static/st5_tSNE_dat.csv')
parser = CommaSeparatedListOutputParser()

# LOAD MODELS:
model = Ollama(model="mistral")
embedding_model = SentenceTransformer('sentence-transformers/sentence-t5-base', device='cpu')


# UTILITY FUNCTIONS:
def remove_new_line(value):
    """Return *value* with all line breaks removed (lines are joined directly)."""
    return ''.join(value.splitlines())


def _cosine_similarities(query, matrix):
    """Return the cosine similarity between *query* and every row of *matrix*.

    A zero-norm row or query yields ``nan`` for the affected entries, exactly
    as the scalar formula ``dot(a, b) / (norm(a) * norm(b))`` would.
    """
    # ravel() lets a (1, d) embedding be used the same way as a (d,) one.
    query = np.asarray(query, dtype=float).ravel()
    matrix = np.asarray(matrix, dtype=float)
    return matrix @ query / (norm(matrix, axis=1) * norm(query))


def _format_similarity(series):
    """Return *series* with each value rendered as a two-decimal string."""
    # Converting the whole column at once avoids writing strings cell-by-cell
    # into a float column, which pandas has deprecated.
    return series.map("{:0.2f}".format)


async def neighborhoods(jobtitle=None):
    """Render the t-SNE job map to ``templates/job_neighborhoods.html``.

    Args:
        jobtitle: Accepted for interface compatibility; not used.
    """

    def format_title(logo, title, subtitle, title_font_size=28, subtitle_font_size=14):
        # The font-size parameters are accepted but not used by this code.
        # NOTE(review): the line break between title and subtitle may
        # originally have been a "<br>" tag; confirm against the rendered
        # page before changing it.
        return f'{logo}{title}\n{subtitle}'

    fig = px.scatter(
        tsne_dat,
        x='longitude',
        y='latitude',
        color='Category',
        hover_data=['Category', 'Title'],
        title=format_title("Pathfinder", " Job Neighborhoods: Explore the Map!", ""),
    )
    fig['layout'].update(
        height=1000,
        width=1500,
        font=dict(family='Courier New, monospace', color='black'),
    )
    fig.write_html('templates/job_neighborhoods.html')


def get_resume(resume):
    """Save an uploaded .docx resume under ``static/`` and return its text.

    Args:
        resume: Upload object exposing ``filename`` and a readable ``file``.

    Returns:
        The document's paragraphs joined with newlines.
    """
    # The filename comes from the client: keep only its final component so a
    # name such as "../../app.py" cannot write outside the static directory.
    safe_name = os.path.basename(str(resume.filename))
    path = f"static/{safe_name}"
    with open(path, 'wb') as buffer:
        buffer.write(resume.file.read())
    document = Document(path)
    return "\n".join(para.text for para in document.paragraphs)


def skill_extractor(resume):
    """Ask the LLM for the skills in *resume* and return them as a list.

    Args:
        resume: Resume text.

    Returns:
        The model's comma-separated answer parsed into a list of strings.
    """
    # Prompt text is assembled from adjacent literals so its content stays
    # exactly as authored while the source lines remain readable.
    system_prompt_template = SystemMessagePromptTemplate.from_template(
        " ### [INST] Instruction: You are an expert job analyst tasked with "
        "identifying both technical and soft skills in resumes. You always "
        "respond in the following format: 'skill1, skill2, skill3, ...' and "
        "never provide an explanation or justification for your response. "
        "For example, given the following statement in a resume: "
        "'significant experience in python and familiarity with machine "
        "learning packages, such as sklearn, torch, and tensorflow' you "
        "respond: 'python, sklearn, torch, tensorflow'. \n"
        "[/INST] "
    )
    human_prompt_template = HumanMessagePromptTemplate.from_template(
        " ### QUESTION: What skills are in the following resume?: {resume} "
    )
    prompt = ChatPromptTemplate.from_messages([system_prompt_template, human_prompt_template])
    llm_chain = LLMChain(llm=model, prompt=prompt)
    result = llm_chain.invoke({"resume": resume})
    # Collapse the answer onto one line before comma-splitting it.
    return parser.parse(remove_new_line(result['text']))


def skillEmbed(skills):
    """Return the sentence-embedding of *skills* from the module-level model."""
    return embedding_model.encode(skills)


async def sim_result_loop(skilltext):
    """Rank O*NET job titles by similarity to the given skills.

    Args:
        skilltext: Either a string of skills, a list of skill strings, or a
            dict whose keys labelled ``"Skill"`` are the skills to use.

    Returns:
        A ``(simResults, embeds)`` tuple: a DataFrame with ``JobTitle`` and
        ``Similarity`` (two-decimal strings) columns, and the embedding that
        was computed for the skills.

    Raises:
        TypeError: If *skilltext* is not a str, dict or list.
    """
    if isinstance(skilltext, str):
        skills = skilltext
    elif isinstance(skilltext, dict):
        selected = [key for key, value in skilltext.items() if value == "Skill"]
        # NOTE(review): this keeps the list's surrounding brackets in the text
        # that gets embedded; preserved so existing results do not shift.
        skills = str(selected).replace("'", "").replace(",", "")
    elif isinstance(skilltext, list):
        skills = ', '.join(skilltext)
    else:
        raise TypeError(
            f"skilltext must be a str, dict or list, not {type(skilltext).__name__}"
        )

    embeds = skillEmbed(skills)

    # The first column of simdat is skipped; the remaining columns are the
    # stored embedding values.
    similarities = _cosine_similarities(embeds, simdat.iloc[:, 1:].to_numpy(dtype=float))
    simResults = pd.DataFrame({
        'JobTitle': simdat['Title'].to_numpy(),
        'Similarity': similarities,
    })
    simResults = simResults.sort_values(by="Similarity", ascending=False)
    # Keep ranks 2-13: the single best match is dropped on purpose.
    # NOTE(review): the reason for discarding the top hit is not visible
    # here; confirm it is intended.
    simResults = simResults.iloc[1:13].reset_index(drop=True)

    # Shift the scores so they fall in a friendlier display range: raise them
    # so the minimum is at least 0.5, then lower them if that pushed the
    # maximum above 1.0.
    lowest = simResults['Similarity'].min()
    if lowest < 0.5:
        simResults['Similarity'] = simResults['Similarity'] + (0.5 - lowest)
    highest = simResults['Similarity'].max()
    if highest > 1.0:
        simResults['Similarity'] = simResults['Similarity'] - (highest - 1.0)

    simResults['Similarity'] = _format_similarity(simResults['Similarity'])
    return simResults, embeds


def get_links(simResults):
    """Return the O*NET summary URL for each ``JobTitle`` in *simResults*."""
    return [
        "https://www.onetonline.org/link/summary/" + get_onet_code(title)
        for title in simResults["JobTitle"]
    ]


def _rank_by_similarity(skills, csv_path, id_label, email_label):
    """Score every stored embedding in *csv_path* against *skills*.

    The CSV must have ``id`` and ``email`` columns; columns from position 5
    onward are used as the embedding. Rows with missing embedding values are
    skipped. The result is sorted by descending similarity.
    """
    embeds = skillEmbed(skills)
    records = pd.read_csv(csv_path)
    vectors = records.iloc[:, 5:].dropna()
    # Select ids and emails by the index that survived dropna(); re-attaching
    # them positionally would pair scores with the wrong rows.
    kept = records.loc[vectors.index]
    ranked = pd.DataFrame({
        id_label: kept['id'].to_numpy(),
        email_label: kept['email'].to_numpy(),
        'similarity': _cosine_similarities(embeds, vectors.to_numpy(dtype=float)),
    })
    ranked = ranked.sort_values(by="similarity", ascending=False).reset_index(drop=True)
    ranked['similarity'] = _format_similarity(ranked['similarity'])
    return ranked


def sim_result_loop_jobFinder(skills):
    """Rank stored job descriptions by similarity to a candidate's skills.

    Args:
        skills: Skill text to embed.

    Returns:
        DataFrame with ``job_id``, ``employer_email`` and ``similarity``
        (two-decimal strings) columns, best match first.
    """
    return _rank_by_similarity(
        skills, 'static/jd_embeddings.csv', 'job_id', 'employer_email'
    )


def sim_result_loop_candFinder(skills):
    """Rank stored resumes by similarity to a job's required skills.

    Args:
        skills: Skill text to embed.

    Returns:
        DataFrame with ``candidate_id``, ``candidate_email`` and
        ``similarity`` (two-decimal strings) columns, best match first.
    """
    return _rank_by_similarity(
        skills, 'static/res_embeddings.csv', 'candidate_id', 'candidate_email'
    )