Souha Ben Hassine committed
Commit 4bd843b · 1 Parent(s): 497f6e8
Files changed (2)
  1. README.md +9 -1
  2. app.py +61 -42
README.md CHANGED
@@ -17,4 +17,12 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 pip install -r requirements.txt
 python3 -m spacy download en_core_web_sm
 
-```
+```
+
+
+
+
+
+
+input_resume = "Abid Ali Awan Data Scientist I am a certified data scientist professional, who loves building machine learning models and blogs about the latest AI technologies. I am currently testing AI Products at PEC-PITC, which later gets approved for human trials. [email protected] +923456855126 Islamabad, Pakistan abidaliawan.me WORK EXPERIENCE Data Scientist Pakistan Innovation and Testing Center - PEC 04/2021 - Present, Islamabad, Pakistan Redesigned data of engineers that were mostly scattered and unavailable. Designed dashboard and data analysis report to help higher management make better decisions. Accessibility of key information has created a new culture of making data-driven decisions. Contact: Ali Raza Asif - [email protected] Data Scientist Freelancing/Kaggle 11/2020 - Present, Islamabad, Pakistan Engineered a healthcare system. Used machine learning to detect some of the common decisions. The project has paved the way for others to use new techniques to get better results. Participated in Kaggle machine learning competitions. Learned new techniques to get a better score and finally got to 1 percent rank. Researcher / Event Organizer CREDIT 02/2017 - 07/2017, Kuala Lumpur, Malaysia Marketing for newly build research lab. Organized technical events and successfully invited the multiple company's CEO for talks. Reduced the gap between industries and educational institutes. Research on new development in the IoT sector. Created research proposal for funding. Investigated the new communication protocol for IoT devices. Contact: Dr. Tan Chye Cheah - [email protected] EDUCATION MSc in Technology Management Staffordshire University 11/2015 - 04/2017, Postgraduate with Distinction Challenges in Implementing IoT-enabled Smart cities in Malaysia. Bachelors Electrical Telecommunication Engineering COMSATS Institute of Information Technology, Islamabad 08/2010 - 01/2014, CGPA: 3.09 Networking Satellite communications Programming/ Matlab Telecommunication Engineering SKILLS Designing Leadership Media/Marketing R/Python SQL Tableau NLP Data Analysis Machine learning Deep learning Webapp/Cloud Feature Engineering Ensembling Time Series Technology Management ACHIEVEMENTS 98th Hungry Geese Simulation Competition (08/2021) 2nd in Covid-19 vaccinations around the world (07/2021) 8th in Automatic Speech Recognition in WOLOF (06/2021) Top 10 in WiDS Datathon. (03/2021) 40th / 622 in MagNet: Model the Geomagnetic Field Hosted by NOAA (02/2021) 18th in Rock, Paper, Scissors/Designing AI Agent Competition. (02/2021) PROJECTS Goodreads Profile Analysis WebApp (09/2021) Data Analysis Web Scraping XLM Interactive Visualization Contributed in orchest.io (08/2021) Testing and Debuging Technical Article Proposing new was to Improve ML pipelines World Vaccine Update System (06/2021) Used sqlite3 for database Automated system for daily update the Kaggle DB and Analysis Interactive dashboard mRNA-Vaccine-Degradation-Prediction (06/2021) Explore our dataset and then preprocessed sequence, structure, and predicted loop type features Train deep learning GRU model Trip Advisor Data Analysis/ML (04/2021) Preprocessing Data, Exploratory Data analysis, Word clouds. Feature Engineering, Text processing. BiLSTM Model for predicting rating, evaluation, model performance. Jane Street Market Prediction (03/2021) EDA, Feature Engineering, experimenting with hyperparameters. Ensembling: Resnet, NN Embeddings, TF Simple NN model. Using simple MLP pytorch model. Achievements/Tasks Achievements/Tasks Achievements/Tasks Thesis Course"
+input_skills = "Data Science,Data Analysis,Database,Machine Learning,tableau"
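The sample `input_skills` string added above is comma-separated with no spaces, which matches how the new `find_matching_resumes` function in the app.py diff below tokenizes it with a plain `lower().split(",")`. A minimal sketch of that parsing, for reference:

```python
# Sketch of how app.py splits the README's sample skills string (see the
# find_matching_resumes diff below). A plain split keeps stray whitespace,
# so "Machine Learning, SQL" would yield " sql" rather than "sql" -- the
# sample string deliberately omits spaces after commas.
input_skills = "Data Science,Data Analysis,Database,Machine Learning,tableau"
required_skills = input_skills.lower().split(",")
print(required_skills)
# ['data science', 'data analysis', 'database', 'machine learning', 'tableau']
```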
app.py CHANGED
@@ -1,9 +1,6 @@
 import gradio as gr
 import pandas as pd
 import spacy
-from spacy.pipeline import EntityRuler
-from spacy.lang.en import English
-from spacy.tokens import Doc
 from spacy import displacy
 import plotly.express as px
 import numpy as np
@@ -14,26 +11,27 @@ from nltk.stem import WordNetLemmatizer
 
 nltk.download(['stopwords','wordnet'])
 nltk.download('omw-1.4')
+
 # Load the CSV file into a DataFrame
 dataset_path = "Resume.csv"
 df = pd.read_csv(dataset_path)
 df= df.reindex(np.random.permutation(df.index))
-data = df.copy().iloc[0:200,]
+data = df.copy().iloc[0:500,]
 
 # Load the spaCy English language model with large vocabulary and pre-trained word vectors
-nlp = spacy.load("en_core_web_lg")
+spacy_model = spacy.load("en_core_web_lg")
 
-# Path to the file containing skill patterns in JSONL format
+# Path to the file containing skill patterns in JSONL format (2129 skills)
 skill_pattern_path = "jz_skill_patterns.jsonl"
 
 # Add an entity ruler to the spaCy pipeline
-ruler = nlp.add_pipe("entity_ruler")
+ruler = spacy_model.add_pipe("entity_ruler")
 
 # Load skill patterns from disk into the entity ruler
 ruler.from_disk(skill_pattern_path)
 
 def get_unique_skills(text):
-    doc = nlp(text)
+    doc = spacy_model(text)
     skills = set()
     for ent in doc.ents:
         if ent.label_ == "SKILL":
@@ -61,12 +59,6 @@ data["Clean_Resume"] = data["Resume_str"].apply(preprocess_resume)
 # Extract skills from each preprocessed resume and store them in a new column
 data["skills"] = data["Clean_Resume"].str.lower().apply(get_unique_skills)
 
-print(data)
-
-Job_cat = data["Category"].unique()
-Job_cat = np.append(Job_cat, "ALL")
-Job_Category = "INFORMATION-TECHNOLOGY"
-
 def get_skills_distribution(Job_Category):
     if Job_Category != "ALL":
         filtered_data = data[data["Category"] == Job_Category]["skills"]
@@ -83,7 +75,6 @@ def get_skills_distribution(Job_Category):
 
     return fig.show()
 
-get_skills_distribution(Job_Category)
 
 # Apply the preprocess_resume function to each resume string and store the result in a new column
 data["Clean_Resume"] = data["Resume_str"].apply(preprocess_resume)
@@ -96,20 +87,8 @@ for a in patterns:
     ruler.add_patterns([{"label": "Job-Category", "pattern": a}])
 
 
-# Load the spaCy model
-nlp = spacy.load("en_core_web_sm")
+# Define the options for highlighting entities
 
-# Define the styles and options for highlighting entities
-colors = {
-    "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
-    "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
-    "ORG": "#ffd966",
-    "PERSON": "#e06666",
-    "GPE": "#9fc5e8",
-    "DATE": "#c27ba0",
-    "ORDINAL": "#674ea7",
-    "PRODUCT": "#f9cb9c",
-}
 options = {
     "ents": [
         "Job-Category",
@@ -121,26 +100,66 @@ options = {
         "ORDINAL",
         "PRODUCT",
     ],
-    "colors": colors,
 }
 
 # Define a function to process the resume text and highlight entities
 def highlight_entities(resume_text):
     # Process the resume text with spaCy
-    doc = nlp(resume_text)
+    doc = spacy_model(resume_text)
     # Render the entities with displacy and return the HTML
     html = displacy.render(doc, style="ent", options=options, jupyter=False)
     return html
 
-# Create the Gradio interface
-iface = gr.Interface(
-    fn=highlight_entities,
-    inputs=gr.Textbox(lines=10, label="Input Resume Text"),
-    outputs=gr.HTML(label="Highlighted Entities"),
-    title="Resume Entity Highlighter",
-    description="Enter your resume text and see entities highlighted.",
-    theme="compact"
-)
-
-# Launch the interface
-iface.launch()
+def calculate_semantic_similarity(required_skills, resume_skills):
+    """
+    Calculate the semantic similarity between required skills and resume skills.
+    """
+    required_skills_str = " ".join(required_skills)
+    resume_skills_str = " ".join(resume_skills)
+    required_skills_doc = spacy_model(required_skills_str)
+    resume_skills_doc = spacy_model(resume_skills_str)
+    similarity_score = required_skills_doc.similarity(resume_skills_doc)
+    return similarity_score
+
+def find_matching_resumes(input_skills, n=5):
+    """
+    Find and rank the top matching resumes based on input skills.
+    """
+    req_skills = input_skills.lower().split(",")
+    ranked_resumes = []
+    for idx, row in data.iterrows():
+        resume_skills = row['skills']
+        similarity_score = calculate_semantic_similarity(req_skills, resume_skills)
+        ranked_resumes.append((idx, similarity_score))
+
+    # Sort resumes by similarity scores in descending order
+    ranked_resumes.sort(key=lambda x: x[1], reverse=True)
+
+    # Get the top N matching resumes
+    top_matching_resumes = ranked_resumes[:n]
+
+    # Construct output in a structured format
+    output = []
+    for resume_id, score in top_matching_resumes:
+        output.append(f"Similarity Score: {score}\nResume ID: {resume_id}")
+
+    return output
+
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("Enter your resume text and perform NER, or enter the required skills and find the top matching resumes.")
+    with gr.Tab("Enter your resume text and perform NER"):
+        text_input = gr.Textbox(lines=10, label="Input Resume Text")
+        text_output = gr.HTML(label="Highlighted Entities")
+        text_button = gr.Button("Submit")
+    with gr.Tab("Enter the required skills (comma-separated) and find the top matching resumes."):
+
+        text_input2 = gr.Textbox(lines=5, label="Input Required Skills (comma-separated)")
+        text_output2 = gr.Textbox(label="Top Matching Resumes")
+        text_button2 = gr.Button("Submit")
+
+    text_button.click(highlight_entities, inputs=text_input, outputs=text_output)
+    text_button2.click(find_matching_resumes, inputs=text_input2, outputs=text_output2)
+
+demo.launch()
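For a quick local check of the new matching path without starting the Gradio server (importing app.py would run `demo.launch()` at module level), the similarity step can be exercised on its own. A minimal sketch, assuming `en_core_web_lg` is installed; the hand-picked `resume_skills` list stands in for a real `get_unique_skills` result:

```python
# Standalone sketch of the commit's semantic-matching step. Doc.similarity on
# vector models compares averaged word vectors, so an empty skill list yields
# a zero vector (spaCy emits a warning and the score is 0.0).
import spacy

spacy_model = spacy.load("en_core_web_lg")

# Sample inputs mirroring the strings added to the README.
required_skills = "Data Science,Data Analysis,Database,Machine Learning,tableau".lower().split(",")
resume_skills = ["python", "sql", "tableau", "machine learning", "data analysis"]  # hypothetical extraction result

required_doc = spacy_model(" ".join(required_skills))
resume_doc = spacy_model(" ".join(resume_skills))
print(f"Similarity Score: {required_doc.similarity(resume_doc):.3f}")
```

Because the score is a cosine over averaged vectors, it rewards topical overlap rather than exact skill matches, so two same-domain skill sets can score high even with few shared terms.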