bangaboy committed on
Commit
e383c8d
·
verified ·
1 Parent(s): 2d7f816

Delete resume_parser.py

Browse files
Files changed (1) hide show
  1. resume_parser.py +0 -126
resume_parser.py DELETED
@@ -1,126 +0,0 @@
1
- import fitz # PyMuPDF for PDF text extraction
2
- import re
3
- import spacy
4
- from transformers import pipeline
5
- from docx import Document
6
- import dateparser
7
- from datetime import datetime
8
- from nltk.corpus import words
9
- from models.load_models import nlp_spacy, nlp_ner
10
-
11
# Set of every entry in the NLTK `words` corpus, built once at import time.
# NOTE(review): nothing in the visible part of this file reads
# `english_words` — confirm it is still needed before keeping this load.
english_words = set(words.words())
13
-
14
- # Function to refine ORG entities
15
def refine_org_entities(entities):
    """Keep only the ORG candidates that look like real company names.

    A candidate is kept when it ends with a known corporate suffix (a
    trailing "." or "," is ignored for this check, so "ACME Inc." counts)
    or when it starts with two capitalised words such as "Goldman Sachs".

    Args:
        entities: Iterable of candidate organisation names (strings).

    Returns:
        A sorted list of the unique candidates that passed either check.
    """
    company_suffixes = ('Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.')
    two_capitalised_words = re.compile(r'([A-Z][a-z]+)\s([A-Z][a-z]+)')

    refined_entities = set()
    for entity in entities:
        # "S.A." carries its own dots, so test the raw text first and only
        # then the text with trailing punctuation removed.
        has_suffix = (
            entity.endswith(company_suffixes)
            or entity.rstrip('.,').endswith(company_suffixes)
        )
        if has_suffix or two_capitalised_words.match(entity):
            refined_entities.add(entity)

    # Sets of strings iterate in hash order, which differs between runs;
    # sorting keeps the output stable for display and comparison.
    return sorted(refined_entities)
25
-
26
- # Function to extract ORG entities using NER
27
def extract_orgs(text):
    """Run the NER pipeline over ``text`` and return the refined ORG names.

    Every result whose ``entity_group`` is ``'ORG'`` contributes its
    ``word``; duplicates are collapsed before the names are passed through
    ``refine_org_entities``.
    """
    org_names = {
        hit['word']
        for hit in nlp_ner(text)
        if hit['entity_group'] == 'ORG'
    }
    return refine_org_entities(org_names)
34
-
35
- # Function to extract text from PDF
36
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object; the bytes returned by its
            ``read()`` are opened by PyMuPDF as an in-memory PDF stream.

    Returns:
        The text of all pages joined in page order, with no separator added.
    """
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )
    finally:
        # Release the document even when a page fails to load or extract.
        doc.close()
43
-
44
- # Function to extract text from DOCX
45
def extract_text_from_doc(doc_file):
    """Return the text of a DOCX document, one paragraph per line."""
    paragraphs = Document(doc_file).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
49
-
50
- # Function to extract experience
51
def extract_experience(doc):
    """Estimate years of experience from the DATE entities of ``doc``.

    Each entity labelled ``"DATE"`` is parsed with ``dateparser``; the
    result is the largest difference between the current calendar year and
    the year of any date that parsed. Returns 0 when no date parses or
    when every parsed date lies in the current year or later.
    """
    parsed_dates = (
        dateparser.parse(ent.text)
        for ent in doc.ents
        if ent.label_ == "DATE"
    )
    year_gaps = [
        datetime.now().year - parsed.year
        for parsed in parsed_dates
        if parsed
    ]
    # The leading 0 is the floor: an empty list or negative gaps yield 0.
    return max([0, *year_gaps])
59
-
60
- # Function to extract phone numbers
61
def extract_phone(text):
    """Return the first North-American-style phone number found in ``text``.

    Accepts an optional ``+1``/``1`` country prefix, an area code with or
    without parentheses, and ``-``, ``.`` or whitespace as separators.

    Args:
        text: Text to search.

    Returns:
        The matched number exactly as written, or ``"Not found"``.
    """
    # Lookarounds replace ``\b`` at both ends: ``\b`` can never match in
    # front of "(" or "+", which made "(555) 123-4567" unmatchable and
    # dropped the "+" of "+1 ...". A bare 10-digit number is covered by the
    # same pattern, so no second pattern is needed.
    phone_pattern = (
        r'(?<!\w)(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})'
        r'[-.\s]?\d{3}[-.\s]?\d{4}(?!\w)'
    )
    match = re.search(phone_pattern, text)
    return match.group() if match else "Not found"
71
-
72
- # Function to extract email addresses
73
def extract_email(text):
    """Return the first e-mail address found in ``text``.

    Args:
        text: Text to search.

    Returns:
        The matched address, or ``"Not found"`` when there is none.
    """
    # The top-level domain class is letters only; a "|" inside a character
    # class is a literal pipe, not alternation, and would leak into matches
    # taken from pipe-separated contact lines.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
77
-
78
- # Function to extract colleges
79
def extract_colleges(doc):
    """Return the educational institutions named in ``doc``.

    An entity counts when its label is ``"ORG"`` and its text contains,
    case-insensitively, one of "university", "college", "institute" or
    "school".

    Args:
        doc: Object exposing ``ents``, an iterable of entities that each
            have ``label_`` and ``text`` attributes.

    Returns:
        A sorted list of the unique matching entity texts.
    """
    edu_keywords = ("university", "college", "institute", "school")
    colleges = {
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG"
        and any(keyword in ent.text.lower() for keyword in edu_keywords)
    }
    # Sorting gives a stable order; a set of strings iterates in an order
    # that changes from run to run.
    return sorted(colleges)
86
-
87
- # Function to extract LinkedIn profile
88
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in ``text``.

    Matches ``//linkedin.com/in/<slug>`` with an optional ``http:`` or
    ``https:`` scheme, an optional subdomain and an optional trailing slash.

    Args:
        text: Text to search.

    Returns:
        The matched URL, or ``"Not found"`` when there is none.
    """
    # The slug class spells out both letter ranges: "A-z" also spans the
    # punctuation between "Z" and "a" ("[", "\", "]", "^", "`"), which let
    # stray brackets end up inside the returned URL.
    linkedin_pattern = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"
92
-
93
- # Main function to extract resume data
94
def extract_resume_data(uploaded_file):
    """Parse an uploaded resume and return its key fields plus the raw text.

    The reader is chosen from the text after the last "." in
    ``uploaded_file.name`` (lower-cased): ``pdf`` uses the PDF reader,
    ``docx`` and ``doc`` use the DOCX reader.

    Args:
        uploaded_file: File-like object with a ``name`` attribute.

    Returns:
        A ``(fields, resume_text)`` tuple, where ``fields`` is a dict with
        the keys "Years of Experience", "Companies Worked For",
        "Phone Number", "Email ID", "Colleges Attended" and "LinkedIn ID".

    Raises:
        ValueError: If the extension is not supported or the extracted
            text is empty or whitespace only.
    """
    # NOTE(review): legacy .doc goes through the same reader as .docx —
    # confirm that reader can actually open that format.
    readers = {
        'pdf': extract_text_from_pdf,
        'docx': extract_text_from_doc,
        'doc': extract_text_from_doc,
    }
    extension = uploaded_file.name.rpartition('.')[2].lower()
    reader = readers.get(extension)
    if reader is None:
        raise ValueError("Unsupported file format.")

    resume_text = reader(uploaded_file)
    if not resume_text.strip():
        raise ValueError("The resume appears to be empty.")

    # The SpaCy document feeds the entity-based extractors; the regex and
    # NER extractors work on the raw text.
    doc = nlp_spacy(resume_text)

    companies = extract_orgs(resume_text)
    experience = extract_experience(doc)
    phone = extract_phone(resume_text)
    email = extract_email(resume_text)
    colleges = extract_colleges(doc)
    linkedin = extract_linkedin(resume_text)

    fields = {
        "Years of Experience": experience,
        "Companies Worked For": ", ".join(companies),
        "Phone Number": phone,
        "Email ID": email,
        "Colleges Attended": ", ".join(colleges),
        "LinkedIn ID": linkedin
    }
    return fields, resume_text