celise88 committed
Commit e1f9362 · 1 Parent(s): d305272

provide documentation

Files changed (2):
  1. main.py +76 -56
  2. static/cohere_tSNE_dat.csv +0 -3
main.py CHANGED
@@ -1,3 +1,12 @@
+# Author: Caitlin Blackmore
+# Project: Pathfinder
+# Project Description: This is a web application designed to facilitate job-mobility.
+# It uses NLP to help job seekers find jobs that match their skills and interests.
+# Date: 2023-02-03
+# File Description: This is the main file, containing the FastAPI app and all the endpoints.
+# License: MIT License
+
+# IMPORTS
 from fastapi import FastAPI, Request, Form, File, UploadFile
 from fastapi.templating import Jinja2Templates
 from fastapi.staticfiles import StaticFiles
@@ -18,46 +27,94 @@ from nltk.tokenize import SpaceTokenizer
 import nltk
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 from dotenv import load_dotenv
+
+# LOAD ENVIRONMENT VARIABLES
 load_dotenv()
 
+# SSL CERTIFICATE FIX
 try:
     _create_unverified_https_context = ssl._create_unverified_context
 except AttributeError:
     pass
 else:
     ssl._create_default_https_context = _create_unverified_https_context
+
+# DOWNLOAD NLTK DATA IF NOT ALREADY DOWNLOADED
 if os.path.isdir('nltk_data')==False:
     nltk.download('stopwords', quiet=True)
 
+# APP SETUP
 app = FastAPI()
 app.mount("/static", StaticFiles(directory='static'), name="static")
 templates = Jinja2Templates(directory="templates/")
 
+# LOAD DATA
 onet = pd.read_csv('static/ONET_JobTitles.csv')
 simdat = pd.read_csv('static/cohere_embeddings.csv')
-coheredat = pd.read_csv("static/cohere_tSNE_dat.csv")
 
+# LOAD FINE-TUNED MODEL
+# (see https://huggingface.co/celise88/distilbert-base-uncased-finetuned-binary-classifier)
 model = AutoModelForSequenceClassification.from_pretrained('static/model_shards', low_cpu_mem_usage=True)
 tokenizer = AutoTokenizer.from_pretrained('static/tokenizer_shards', low_cpu_mem_usage=True)
 classifier = pipeline('text-classification', model = model, tokenizer = tokenizer)
 
-### job information center ###
-# get
+# UTILITY FUNCTIONS
+def clean_my_text(text):
+    clean_text = ' '.join(text.splitlines())
+    clean_text = clean_text.replace('-', " ").replace("/"," ")
+    clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))
+    return clean_text
+
+def remove_new_line(value):
+    return ''.join(value.splitlines())
+
+def coSkillEmbed(text):
+    try:
+        co = cohere.Client(os.getenv("COHERE_TOKEN"))
+        response = co.embed(
+            model='large',
+            texts=[text])
+        return response.embeddings
+    except CohereError as e:
+        return e
+
+def skillNER(resume):
+    resume = clean_my_text(resume)
+    stops = set(nltk.corpus.stopwords.words('english'))
+    stops = stops.union({'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
+        'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include',
+        'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
+        'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams'})
+    resume = [word for word in SpaceTokenizer().tokenize(resume) if word not in stops]
+    resume = [word for word in resume if ")" not in word]
+    resume = [word for word in resume if "(" not in word]
+
+    labels = []
+    for i in range(len(resume)):
+        classification = classifier(resume[i])[0]['label']
+        if classification == 'LABEL_1':
+            labels.append("Skill")
+        else:
+            labels.append("Not Skill")
+    labels_dict = dict(zip(resume, labels))
+    return labels_dict
+
+def cosine(A, B):
+    return np.dot(A,B)/(norm(A)*norm(B))
+
+### JOB INFORMATION CENTER ###
+# GET
 @app.get("/")
 def render_job_list(request: Request):
     joblist = onet['JobTitle']
     return templates.TemplateResponse('job_list.html', context={'request': request, 'joblist': joblist})
 
-# post
+# POST
 @app.post("/")
 def render_job_info(request: Request, jobtitle: str = Form(enum=[x for x in onet['JobTitle']])):
-
-    def remove_new_line(value):
-        return ''.join(value.splitlines())
-
     joblist = onet['JobTitle']
-
     if jobtitle:
+        # SCRAPE ONET TO GET JOB DESCRIPTION, TASKS, ETC.
         onetCode = onet.loc[onet['JobTitle'] == jobtitle, 'onetCode']
         onetCode = onetCode.reindex().tolist()[0]
         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
@@ -76,6 +133,7 @@ def render_job_info(request: Request, jobtitle: str = Form(enum=[x for x in onet
         tasks = remove_new_line(tasks).replace("related occupations", " ").replace("core", " - ").replace(" )importance category task", "").replace(" find ", "")
         tasks = tasks.split(". ")
         tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
+
         return templates.TemplateResponse('job_list.html', context={
             'request': request,
             'joblist': joblist,
@@ -83,20 +141,22 @@ def render_job_info(request: Request, jobtitle: str = Form(enum=[x for x in onet
             'jobdescription': jobdescription,
             'tasks': tasks})
 
-### job neighborhoods ###
+### JOB NEIGHBORHOODS ###
 @app.get("/explore-job-neighborhoods/", response_class=HTMLResponse)
 def render_job_neighborhoods(request: Request):
     return templates.TemplateResponse('job_neighborhoods.html', context={'request': request})
 
-### find my match ###
-# get
+### FIND-MY-MATCH ###
+# GET
 @app.get("/find-my-match/", response_class=HTMLResponse)
-async def match_page(request: Request):
+def match_page(request: Request):
     return templates.TemplateResponse('find_my_match.html', context={'request': request})
 
-# post
+# POST
 @app.post('/find-my-match/', response_class=HTMLResponse)
-def get_resume(request: Request, resume: UploadFile = File(...)):
+async def get_resume(request: Request, resume: UploadFile = File(...)):
+
+    # READ AND PERFORM BASIC CLEANING ON RESUME
     path = f"static/{resume.filename}"
     with open(path, 'wb') as buffer:
         buffer.write(resume.file.read())
@@ -106,34 +166,13 @@ def get_resume(request: Request, resume: UploadFile = File(...)):
         text.append(para.text)
     resume = "\n".join(text)
 
-    def clean_my_text(text):
-        clean_text = ' '.join(text.splitlines())
-        clean_text = clean_text.replace('-', " ").replace("/"," ")
-        clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))
-        return clean_text
-
-    def coSkillEmbed(text):
-        try:
-            co = cohere.Client(os.getenv("COHERE_TOKEN"))
-            response = co.embed(
-                model='large',
-                texts=[text])
-            return response.embeddings
-        except CohereError as e:
-            return e
-
-    def cosine(A, B):
-        return np.dot(A,B)/(norm(A)*norm(B))
-
     # GET RESUME EMBEDDINGS AND JOB SIMILARITY SCORES
     embeds = coSkillEmbed(resume)
     simResults = []
-
     for i in range(len(simdat)):
         simResults.append(cosine(np.array(embeds), np.array(simdat.iloc[i,1:])))
     simResults = pd.DataFrame(simResults)
    simResults['JobTitle'] = simdat['Title']
-
     simResults = simResults.iloc[:,[1,0]]
     simResults.columns = ['JobTitle', 'Similarity']
     simResults = simResults.sort_values(by = "Similarity", ascending = False)
@@ -144,25 +183,6 @@ def get_resume(request: Request, resume: UploadFile = File(...)):
         simResults.iloc[x,1] = "{:0.2f}".format(simResults.iloc[x,1])
 
     # EXTRACT SKILLS FROM RESUME
-    def skillNER(resume):
-        resume = clean_my_text(resume)
-        stops = set(nltk.corpus.stopwords.words('english'))
-        stops = stops.union({'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
-            'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include'
-            'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
-            'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams'})
-        resume = [word for word in SpaceTokenizer().tokenize(resume) if word not in stops]
-        resume = [word for word in resume if ")" not in word]
-        resume = [word for word in resume if "(" not in word]
-
-        labels = []
-        for i in range(len(resume)):
-            classification = classifier(resume[i])[0]['label']
-            if classification == 'LABEL_1':
-                labels.append("Skill")
-            else:
-                labels.append("Not Skill")
-        labels_dict = dict(zip(resume, labels))
-        return labels_dict
     skills = skillNER(resume)
+
     return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults})
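Illustration (not part of the commit): the hoisted coSkillEmbed/cosine helpers are what rank jobs against an uploaded resume. The sketch below reproduces that scoring loop in isolation, with toy 3-dimensional vectors standing in for Cohere's embeddings and a two-row frame standing in for static/cohere_embeddings.csv (assumed layout: a Title column followed by one column per embedding dimension, as simdat.iloc[i, 1:] implies).

import numpy as np
import pandas as pd
from numpy.linalg import norm

def cosine(A, B):
    # same helper as in main.py
    return np.dot(A, B) / (norm(A) * norm(B))

# stand-in for pd.read_csv('static/cohere_embeddings.csv')
simdat = pd.DataFrame({'Title': ['Data Scientist', 'Web Developer'],
                       'dim0': [0.9, 0.1], 'dim1': [0.2, 0.8], 'dim2': [0.4, 0.3]})

# stand-in for coSkillEmbed(resume)[0], i.e. co.embed(model='large', texts=[resume])
embeds = np.array([0.8, 0.3, 0.5])

simResults = pd.DataFrame({
    'JobTitle': simdat['Title'],
    'Similarity': [cosine(embeds, np.array(simdat.iloc[i, 1:], dtype=float))
                   for i in range(len(simdat))]})
simResults = simResults.sort_values(by='Similarity', ascending=False)
print(simResults)  # Data Scientist ranks first (~0.99 vs ~0.55)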
 
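Likewise illustrative: the loop inside skillNER classifies each whitespace-split token independently and maps LABEL_1 to "Skill". The sketch below runs that decision against the public checkpoint linked in the new comments (assuming it matches the local static/model_shards copy); the tokens are toy, pre-cleaned input.

from transformers import pipeline

# public copy of the fine-tuned binary classifier referenced in main.py's comments
classifier = pipeline('text-classification',
                      model='celise88/distilbert-base-uncased-finetuned-binary-classifier')

tokens = ['python', 'collaborated', 'sql', 'meetings']
labels_dict = {tok: 'Skill' if classifier(tok)[0]['label'] == 'LABEL_1' else 'Not Skill'
               for tok in tokens}
print(labels_dict)  # per-token labels; exact output depends on the model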
static/cohere_tSNE_dat.csv DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ac3dbbea21867638654b3c399b988ca95c5573cc602383d8835cffe36952a7cb
-size 1858107
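For reference, the three deleted lines are a Git LFS pointer, not the CSV itself: the ~1.8 MB file lived in LFS storage and only this stub was tracked in-repo. A minimal sketch of parsing that pointer format (parse_lfs_pointer is a hypothetical helper, not part of the project):

def parse_lfs_pointer(text: str) -> dict:
    # each pointer line is "key value"; the oid field is "sha256:<hex digest>"
    fields = dict(line.split(' ', 1) for line in text.strip().splitlines())
    return {'version': fields['version'],
            'sha256': fields['oid'].split(':', 1)[1],
            'size_bytes': int(fields['size'])}

pointer = ("version https://git-lfs.github.com/spec/v1\n"
           "oid sha256:ac3dbbea21867638654b3c399b988ca95c5573cc602383d8835cffe36952a7cb\n"
           "size 1858107\n")
print(parse_lfs_pointer(pointer))  # size_bytes=1858107 (~1.8 MB)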