import gradio as gr
import pandas as pd
import spacy
from spacy import displacy
import plotly.express as px
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources needed for stopword removal and lemmatization
nltk.download(["stopwords", "wordnet", "omw-1.4"])
# Load the CSV file into a DataFrame
dataset_path = "Resume.csv"
df = pd.read_csv(dataset_path)
# Shuffle the rows and keep the first 200 resumes to keep processing manageable
df = df.reindex(np.random.permutation(df.index))
data = df.iloc[:200].copy()
# Load the spaCy English language model with large vocabulary and pre-trained word vectors
nlp = spacy.load("en_core_web_lg")
# Path to the file containing skill patterns in JSONL format
skill_pattern_path = "jz_skill_patterns.jsonl"
# Add an entity ruler to the spaCy pipeline
ruler = nlp.add_pipe("entity_ruler")
# Load skill patterns from disk into the entity ruler
ruler.from_disk(skill_pattern_path)
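# For reference, each JSONL line holds one entity-ruler pattern; illustrative
# examples (not necessarily the actual file contents):
#   {"label": "SKILL", "pattern": [{"LOWER": "python"}]}
#   {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]}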
# Return the unique SKILL entities found in a piece of text
def get_unique_skills(text):
    doc = nlp(text)
    skills = set()
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            skills.add(ent.text)
    return list(skills)
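# Illustrative example (actual output depends on the patterns loaded above):
#   get_unique_skills("proficient in python and sql") -> ["python", "sql"]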
def preprocess_resume(resume_str):
    # Remove Twitter-style mentions, URLs, a leading "rt", and any remaining
    # non-alphanumeric characters
    review = re.sub(
        r"(@[A-Za-z0-9]+)|(\w+:\/\/\S+)|(http\S+)|^rt|([^0-9A-Za-z \t])",
        " ",
        resume_str,
    )
    # Convert to lowercase and tokenize
    review = review.lower().split()
    # Lemmatize and remove stopwords (build the stopword set once, not per word)
    lm = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    review = [lm.lemmatize(word) for word in review if word not in stop_words]
    # Join the words back into a string
    return " ".join(review)
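# Illustrative example:
#   preprocess_resume("Managed teams of engineers @acme http://acme.io")
#   -> "managed team engineer"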
# Apply the preprocess_resume function to each resume string and store the result in a new column
data["Clean_Resume"] = data["Resume_str"].apply(preprocess_resume)
# Extract skills from each preprocessed resume and store them in a new column
data["skills"] = data["Clean_Resume"].str.lower().apply(get_unique_skills)
print(data)
Job_cat = data["Category"].unique()
Job_cat = np.append(Job_cat, "ALL")
Job_Category = "INFORMATION-TECHNOLOGY"
def get_skills_distribution(Job_Category):
    # Select skills for a single category, or for all resumes when "ALL"
    if Job_Category != "ALL":
        filtered_data = data[data["Category"] == Job_Category]["skills"]
    else:
        filtered_data = data["skills"]
    # Flatten the per-resume skill lists into one list
    total_skills = [skill for sublist in filtered_data for skill in sublist]
    fig = px.histogram(
        x=total_skills,
        labels={"x": "Skills"},
        title=f"{Job_Category} Distribution of Skills",
    ).update_xaxes(categoryorder="total descending")
    # Return the figure itself; fig.show() returns None
    return fig

get_skills_distribution(Job_Category).show()
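# In a headless environment the figure can be saved instead of shown, e.g.:
#   get_skills_distribution("ALL").write_html("skills_distribution.html")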
# Register each job category as a "Job-Category" pattern in the entity ruler
patterns = data.Category.unique()
for a in patterns:
    ruler.add_patterns([{"label": "Job-Category", "pattern": a}])
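# e.g. this adds patterns such as
#   {"label": "Job-Category", "pattern": "INFORMATION-TECHNOLOGY"}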
# Keep the pipeline built above: loading a fresh model here would discard the
# entity ruler, so SKILL and Job-Category entities would never be highlighted
# Define the styles and options for highlighting entities
colors = {
"Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
"SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
"ORG": "#ffd966",
"PERSON": "#e06666",
"GPE": "#9fc5e8",
"DATE": "#c27ba0",
"ORDINAL": "#674ea7",
"PRODUCT": "#f9cb9c",
}
options = {
"ents": [
"Job-Category",
"SKILL",
"ORG",
"PERSON",
"GPE",
"DATE",
"ORDINAL",
"PRODUCT",
],
"colors": colors,
}
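# displacy renders only the entity labels listed in "ents", using the colors
# defined above; any other entity types in the doc are left unhighlighted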
# Define a function to process the resume text and highlight entities
def highlight_entities(resume_text):
    # Process the resume text with spaCy
    doc = nlp(resume_text)
    # Render the entities with displacy and return the HTML
    html = displacy.render(doc, style="ent", options=options, jupyter=False)
    return html
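# Example (hypothetical input):
#   highlight_entities("John Doe is a Python developer at Google.")
#   returns an HTML string with the recognized spans wrapped in styled <mark> tags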
# Create the Gradio interface
iface = gr.Interface(
fn=highlight_entities,
inputs=gr.Textbox(lines=10, label="Input Resume Text"),
outputs=gr.HTML(label="Highlighted Entities"),
title="Resume Entity Highlighter",
description="Enter your resume text and see entities highlighted.",
theme="compact"
)
# Launch the interface
iface.launch()