import re

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from spacy import displacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc

# Fetch the NLTK corpora needed for stopword removal and lemmatization.
nltk.download(['stopwords','wordnet'])
nltk.download('omw-1.4')
# Load the CSV file into a DataFrame
dataset_path = "Resume.csv"
df = pd.read_csv(dataset_path)

# Shuffle the rows, then keep a 200-row working sample. Slicing before copying
# gives `data` its own storage without duplicating the whole frame first.
df = df.reindex(np.random.permutation(df.index))
data = df.iloc[:200].copy()

# Load the spaCy English language model with large vocabulary and pre-trained word vectors
nlp = spacy.load("en_core_web_lg")

# Path to the file containing skill patterns in JSONL format
skill_pattern_path = "jz_skill_patterns.jsonl"

# Add an entity ruler to the spaCy pipeline and load the skill patterns into it
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
def get_unique_skills(text):
    """Return the distinct SKILL entity texts found in *text*.

    Args:
        text: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        A sorted list of the unique entity strings labelled ``"SKILL"``;
        an empty list when *text* is empty or ``None``.
    """
    # Nothing to analyse, so skip the pipeline call entirely.
    if not text:
        return []
    doc = nlp(text)
    # A set removes duplicates; sorting makes the order reproducible across runs.
    return sorted({ent.text for ent in doc.ents if ent.label_ == "SKILL"})
def preprocess_resume(resume_str):
    """Clean a raw resume string for downstream entity extraction.

    The text is stripped of mentions, URLs and every character that is not
    an ASCII letter, digit, space or tab; then lowercased, split on
    whitespace, filtered against the NLTK English stopword list and
    lemmatized with WordNet.

    Args:
        resume_str: Raw resume text.

    Returns:
        The cleaned tokens joined by single spaces. An empty string is
        returned when the input is not a string (for example a missing
        value) or when no tokens survive the cleaning.
    """
    if not isinstance(resume_str, str):
        return ""

    # Remove special characters, URLs, and Twitter mentions
    cleaned = re.sub(
        r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"',
        " ",
        resume_str,
    )
    # Convert to lowercase and tokenize
    tokens = cleaned.lower().split()
    if not tokens:
        return ""

    # Build the stopword set once per call instead of once per token.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    # Lemmatize the remaining words and join them back into a string
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )
# Apply the preprocess_resume function to each resume string and store the result in a new column
data["Clean_Resume"] = data["Resume_str"].apply(preprocess_resume)

# Extract skills from each preprocessed resume and store them in a new column.
# preprocess_resume lowercases its output, so no extra lowering is needed here.
data["skills"] = data["Clean_Resume"].apply(get_unique_skills)
print(data)

# Selectable job categories: every category present in the sample, plus "ALL"
Job_cat = np.append(data["Category"].unique(), "ALL")
def get_skills_distribution(Job_Category):
    """Build a histogram of the extracted skills for one job category.

    Args:
        Job_Category: A value of the ``Category`` column of ``data``, or
            ``"ALL"`` to include every resume.

    Returns:
        The Plotly figure, with the x-axis categories ordered by
        descending total count.
    """
    if Job_Category == "ALL":
        skill_lists = data["skills"]
    else:
        skill_lists = data.loc[data["Category"] == Job_Category, "skills"]

    # Flatten the per-resume skill lists into a single list of occurrences.
    total_skills = [skill for skills in skill_lists for skill in skills]

    fig = px.histogram(
        x=total_skills,
        labels={"x": "Skills"},
        title=f"{Job_Category} Distribution of Skills",
    ).update_xaxes(categoryorder="total descending")
    # Hand the figure back to the caller; a bare `return` would discard it.
    return fig
# Build the skills histogram over every category. `Job_Category` is only the
# function's parameter name and is undefined at module level, so the "ALL"
# selector is passed explicitly.
get_skills_distribution("ALL")

# data["Clean_Resume"] and data["skills"] are already populated earlier in this
# script; recomputing them here would repeat the full spaCy pass for no change.

# Register every job category as an entity-ruler pattern, in a single call.
patterns = data.Category.unique()
ruler.add_patterns(
    [{"label": "Job-Category", "pattern": category} for category in patterns]
)

# NOTE: `nlp` is deliberately not reloaded here. Replacing it with a fresh
# model would drop the entity ruler configured above, and the "SKILL" and
# "Job-Category" labels could then never be produced for highlighting.
# Define the styles and options for highlighting entities.
# Each entity label maps to the CSS background used for its highlight.
colors = {
    "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
    "ORG": "#ffd966",
    "PERSON": "#e06666",
    "GPE": "#9fc5e8",
    "DATE": "#c27ba0",
    "ORDINAL": "#674ea7",
    "PRODUCT": "#f9cb9c",
}

# Render exactly the labels that have a colour, in the same order; deriving
# the list from `colors` keeps the two in sync.
options = {
    "ents": list(colors),
    "colors": colors,
}
# Define a function to process the resume text and highlight entities
def highlight_entities(resume_text):
    """Render *resume_text* as HTML with its entities highlighted.

    Args:
        resume_text: Resume text entered by the user.

    Returns:
        The displaCy entity markup as an HTML string, or an empty string
        when the input is ``None``, empty or only whitespace.
    """
    # With no text there is nothing to highlight, so skip the pipeline.
    if not resume_text or not resume_text.strip():
        return ""
    # Process the resume text with spaCy
    doc = nlp(resume_text)
    # Render the entities with displacy and return the HTML
    return displacy.render(doc, style="ent", options=options, jupyter=False)
# Create the Gradio interface: a multi-line textbox feeding highlight_entities,
# whose HTML result is shown in an HTML output component.
iface = gr.Interface(
    fn=highlight_entities,
    inputs=gr.Textbox(lines=10, label="Input Resume Text"),
    outputs=gr.HTML(label="Highlighted Entities"),
    title="Resume Entity Highlighter",
    description="Enter your resume text and see entities highlighted.",
    theme="compact",
)

# Launch the interface
iface.launch()