Delete resume_parser.py
resume_parser.py  DELETED  +0 -126
@@ -1,126 +0,0 @@
import fitz  # PyMuPDF for PDF text extraction
import re
import spacy
from transformers import pipeline
from docx import Document
import dateparser
from datetime import datetime
from nltk.corpus import words
from models.load_models import nlp_spacy, nlp_ner

# NLTK words
english_words = set(words.words())

# Function to refine ORG entities
def refine_org_entities(entities):
    refined_entities = set()
    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.']

    for entity in entities:
        if any(entity.endswith(suffix) for suffix in company_suffixes):
            refined_entities.add(entity)
        elif re.match(r'([A-Z][a-z]+)\s([A-Z][a-z]+)', entity):
            refined_entities.add(entity)
    return list(refined_entities)

# Function to extract ORG entities using NER
def extract_orgs(text):
    ner_results = nlp_ner(text)
    orgs = set()
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            orgs.add(entity['word'])
    return refine_org_entities(orgs)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

# Function to extract text from DOCX
def extract_text_from_doc(doc_file):
    doc = Document(doc_file)
    text = '\n'.join([para.text for para in doc.paragraphs])
    return text

# Function to extract experience
def extract_experience(doc):
    experience = 0
    for ent in doc.ents:
        if ent.label_ == "DATE":
            date = dateparser.parse(ent.text)
            if date:
                experience = max(experience, datetime.now().year - date.year)
    return experience

# Function to extract phone numbers
def extract_phone(text):
    phone_patterns = [
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    ]
    for pattern in phone_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()
    return "Not found"

# Function to extract email addresses
def extract_email(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"

# Function to extract colleges
def extract_colleges(doc):
    colleges = set()
    edu_keywords = ["university", "college", "institute", "school"]
    for ent in doc.ents:
        if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords):
            colleges.add(ent.text)
    return list(colleges)

# Function to extract LinkedIn profile
def extract_linkedin(text):
    linkedin_pattern = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-z0-9_-]+\/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"

# Main function to extract resume data
def extract_resume_data(uploaded_file):
    file_ext = uploaded_file.name.split('.')[-1].lower()

    # Extract text based on file type
    if file_ext == 'pdf':
        resume_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in ['docx', 'doc']:
        resume_text = extract_text_from_doc(uploaded_file)
    else:
        raise ValueError("Unsupported file format.")

    if not resume_text.strip():
        raise ValueError("The resume appears to be empty.")

    # Process the resume text using SpaCy
    doc = nlp_spacy(resume_text)

    # Extract required information
    companies = extract_orgs(resume_text)
    experience = extract_experience(doc)
    phone = extract_phone(resume_text)
    email = extract_email(resume_text)
    colleges = extract_colleges(doc)
    linkedin = extract_linkedin(resume_text)

    return {
        "Years of Experience": experience,
        "Companies Worked For": ", ".join(companies),
        "Phone Number": phone,
        "Email ID": email,
        "Colleges Attended": ", ".join(colleges),
        "LinkedIn ID": linkedin
    }, resume_text
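For context, a minimal sketch of how a caller might have driven this module before the deletion, assuming a Streamlit front end (the `uploaded_file.name` access in extract_resume_data matches the objects returned by st.file_uploader). The file name app_sketch.py, widget labels, and layout below are illustrative assumptions, not taken from this Space:

# app_sketch.py - hypothetical caller for the deleted resume_parser module.
# Assumes resume_parser is importable; the Streamlit labels and layout are
# illustrative assumptions, not part of this repository.
import streamlit as st
from resume_parser import extract_resume_data

st.title("Resume Parser")
uploaded = st.file_uploader("Upload a resume", type=["pdf", "docx", "doc"])

if uploaded is not None:
    try:
        # extract_resume_data returns (dict of parsed fields, raw resume text)
        fields, raw_text = extract_resume_data(uploaded)
    except ValueError as err:
        # Covers both error paths raised above: unsupported format, empty resume
        st.error(str(err))
    else:
        for label, value in fields.items():
            st.write(f"{label}: {value}")
        with st.expander("Raw extracted text"):
            st.text(raw_text)

The try/except mirrors the two ValueError cases extract_resume_data raises, so a bad upload surfaces as an on-page error rather than a stack trace.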