Delete resume_parser.py
resume_parser.py  DELETED  +0 -126
@@ -1,126 +0,0 @@
import fitz  # PyMuPDF for PDF text extraction
import re
import spacy
from transformers import pipeline
from docx import Document
import dateparser
from datetime import datetime
from nltk.corpus import words
from models.load_models import nlp_spacy, nlp_ner

# NLTK words
english_words = set(words.words())

# Function to refine ORG entities
def refine_org_entities(entities):
    refined_entities = set()
    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.']

    for entity in entities:
        if any(entity.endswith(suffix) for suffix in company_suffixes):
            refined_entities.add(entity)
        elif re.match(r'([A-Z][a-z]+)\s([A-Z][a-z]+)', entity):
            refined_entities.add(entity)
    return list(refined_entities)

# Function to extract ORG entities using NER
def extract_orgs(text):
    ner_results = nlp_ner(text)
    orgs = set()
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            orgs.add(entity['word'])
    return refine_org_entities(orgs)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

# Function to extract text from DOCX
def extract_text_from_doc(doc_file):
    doc = Document(doc_file)
    text = '\n'.join([para.text for para in doc.paragraphs])
    return text

# Function to extract experience
def extract_experience(doc):
    experience = 0
    for ent in doc.ents:
        if ent.label_ == "DATE":
            date = dateparser.parse(ent.text)
            if date:
                experience = max(experience, datetime.now().year - date.year)
    return experience

# Function to extract phone numbers
def extract_phone(text):
    phone_patterns = [
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    ]
    for pattern in phone_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()
    return "Not found"

# Function to extract email addresses
def extract_email(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"

# Function to extract colleges
def extract_colleges(doc):
    colleges = set()
    edu_keywords = ["university", "college", "institute", "school"]
    for ent in doc.ents:
        if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords):
            colleges.add(ent.text)
    return list(colleges)

# Function to extract LinkedIn profile
def extract_linkedin(text):
    linkedin_pattern = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-z0-9_-]+\/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"

# Main function to extract resume data
def extract_resume_data(uploaded_file):
    file_ext = uploaded_file.name.split('.')[-1].lower()

    # Extract text based on file type
    if file_ext == 'pdf':
        resume_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in ['docx', 'doc']:
        resume_text = extract_text_from_doc(uploaded_file)
    else:
        raise ValueError("Unsupported file format.")

    if not resume_text.strip():
        raise ValueError("The resume appears to be empty.")

    # Process the resume text using SpaCy
    doc = nlp_spacy(resume_text)

    # Extract required information
    companies = extract_orgs(resume_text)
    experience = extract_experience(doc)
    phone = extract_phone(resume_text)
    email = extract_email(resume_text)
    colleges = extract_colleges(doc)
    linkedin = extract_linkedin(resume_text)

    return {
        "Years of Experience": experience,
        "Companies Worked For": ", ".join(companies),
        "Phone Number": phone,
        "Email ID": email,
        "Colleges Attended": ", ".join(colleges),
        "LinkedIn ID": linkedin
    }, resume_text
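For context, a minimal sketch of how a caller might have driven this module before the deletion, assuming a Streamlit front end (the `uploaded_file.name` access in extract_resume_data matches the objects returned by st.file_uploader). The file name app_sketch.py, widget labels, and layout below are illustrative assumptions, not taken from this Space:

# app_sketch.py - hypothetical caller for the deleted resume_parser module.
# Assumes resume_parser is importable; the Streamlit labels and layout are
# illustrative assumptions, not part of this repository.
import streamlit as st
from resume_parser import extract_resume_data

st.title("Resume Parser")
uploaded = st.file_uploader("Upload a resume", type=["pdf", "docx", "doc"])

if uploaded is not None:
    try:
        # extract_resume_data returns (dict of parsed fields, raw resume text)
        fields, raw_text = extract_resume_data(uploaded)
    except ValueError as err:
        # Covers both error paths raised above: unsupported format, empty resume
        st.error(str(err))
    else:
        for label, value in fields.items():
            st.write(f"{label}: {value}")
        with st.expander("Raw extracted text"):
            st.text(raw_text)

The try/except mirrors the two ValueError cases extract_resume_data raises, so a bad upload surfaces as an on-page error rather than a stack trace.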