import gradio as gr
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy import displacy
import plotly.express as px
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download(["stopwords", "wordnet"])
nltk.download("omw-1.4")

# Load the CSV file into a DataFrame, shuffle it, and keep the first 200 rows
dataset_path = "Resume.csv"
df = pd.read_csv(dataset_path)
df = df.reindex(np.random.permutation(df.index))
data = df.copy().iloc[0:200]

# Load the spaCy English language model with large vocabulary and pre-trained word vectors
nlp = spacy.load("en_core_web_lg")

# Path to the file containing skill patterns in JSONL format
skill_pattern_path = "jz_skill_patterns.jsonl"

# Add an entity ruler to the spaCy pipeline
ruler = nlp.add_pipe("entity_ruler")

# Load skill patterns from disk into the entity ruler
ruler.from_disk(skill_pattern_path)


def get_unique_skills(text):
    # Run the pipeline and collect the distinct SKILL entities it finds
    doc = nlp(text)
    skills = set()
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            skills.add(ent.text)
    return list(skills)


def preprocess_resume(resume_str):
    # Remove special characters, URLs, and Twitter mentions
    review = re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?",
        " ",
        resume_str,
    )
    # Convert to lowercase and tokenize
    review = review.lower().split()
    # Lemmatize and remove stopwords
    lm = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    review = [lm.lemmatize(word) for word in review if word not in stop_words]
    # Join the words back into a string
    review = " ".join(review)
    return review


# Apply the preprocess_resume function to each resume string and store the result in a new column
data["Clean_Resume"] = data["Resume_str"].apply(preprocess_resume)

# Extract skills from each preprocessed resume and store them in a new column
data["skills"] = data["Clean_Resume"].str.lower().apply(get_unique_skills)
print(data)

# Available job categories, plus an "ALL" option for the distribution plot
Job_cat = data["Category"].unique()
Job_cat = np.append(Job_cat, "ALL")
Job_Category = "INFORMATION-TECHNOLOGY"


def get_skills_distribution(Job_Category):
    # Select the skill lists for the chosen category (or for every resume)
    if Job_Category != "ALL":
        filtered_data = data[data["Category"] == Job_Category]["skills"]
    else:
        filtered_data = data["skills"]

    # Flatten the per-resume skill lists into one list for plotting
    total_skills = [skill for sublist in filtered_data for skill in sublist]

    fig = px.histogram(
        x=total_skills,
        labels={"x": "Skills"},
        title=f"{Job_Category} Distribution of Skills",
    ).update_xaxes(categoryorder="total descending")

    return fig


get_skills_distribution(Job_Category).show()

# Register each resume category as a Job-Category pattern in the entity ruler
patterns = data.Category.unique()
for a in patterns:
    ruler.add_patterns([{"label": "Job-Category", "pattern": a}])

# Switch to the lighter spaCy model for the interactive app and re-attach the
# entity ruler, otherwise the SKILL and Job-Category entities added above
# would be lost when the pipeline is reloaded
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
for a in patterns:
    ruler.add_patterns([{"label": "Job-Category", "pattern": a}])

# Define the styles and options for highlighting entities
colors = {
    "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
    "ORG": "#ffd966",
    "PERSON": "#e06666",
    "GPE": "#9fc5e8",
    "DATE": "#c27ba0",
    "ORDINAL": "#674ea7",
    "PRODUCT": "#f9cb9c",
}
options = {
    "ents": [
        "Job-Category",
        "SKILL",
        "ORG",
        "PERSON",
        "GPE",
        "DATE",
        "ORDINAL",
        "PRODUCT",
    ],
    "colors": colors,
}
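
# Optional sanity check (a minimal sketch): run the rebuilt pipeline on a short
# illustrative sentence and print any SKILL or Job-Category entities it finds.
# The sample text is made up, and it assumes terms such as "python" and "sql"
# appear in jz_skill_patterns.jsonl.
sample_doc = nlp("Senior python developer with sql and machine learning experience")
print([(ent.text, ent.label_) for ent in sample_doc.ents if ent.label_ in ("SKILL", "Job-Category")])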

# Define a function to process the resume text and highlight entities
def highlight_entities(resume_text):
    # Process the resume text with spaCy
    doc = nlp(resume_text)
    # Render the entities with displacy and return the HTML
    html = displacy.render(doc, style="ent", options=options, jupyter=False)
    return html


# Create the Gradio interface
iface = gr.Interface(
    fn=highlight_entities,
    inputs=gr.Textbox(lines=10, label="Input Resume Text"),
    outputs=gr.HTML(label="Highlighted Entities"),
    title="Resume Entity Highlighter",
    description="Enter your resume text and see entities highlighted.",
    theme="compact",
)

# Launch the interface
iface.launch()
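
# Optional: depending on the installed Gradio version, iface.launch(share=True)
# creates a temporary public link and server_port= pins the local port.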