diff --git a/.github/workflows/update_space.yml b/.github/workflows/update_space.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f34cc087d3d9e695c9828588148dd36d44484052
--- /dev/null
+++ b/.github/workflows/update_space.yml
@@ -0,0 +1,28 @@
+name: Run Python script
+
+on:
+  push:
+    branches:
+      - i
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+
+      - name: Install Gradio
+        run: python -m pip install gradio
+
+      - name: Log in to Hugging Face
+        run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+      - name: Deploy to Spaces
+        run: gradio deploy
diff --git a/README.MD b/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..28dce5a6541cf96b000a0d8975f9981751abb8ea
--- /dev/null
+++ b/README.MD
@@ -0,0 +1,54 @@
+# Intelligent Resume Classification and Job Matching System
+
+## Description
+
+Demo app for resume classification: upload a PDF resume and get job-title predictions from BERT, Random Forest, and SVM models.
+
+## Usage
+
+### Step 1: Create a Virtual Python Environment
+
+First, create a new virtual environment so that all dependencies are isolated from your main Python installation.
+
+```bash
+conda create -n resume-atlas python=3.8
+```
+
+### Step 2: Activate the Environment
+
+```bash
+conda activate resume-atlas
+```
+
+### Step 3: Install the Requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+### Step 4: Download the NLTK Data
+
+```bash
+python modules/install_stop_words.py
+```
+
+### Step 5: Run the App
+
+```bash
+python app.py
+```
+
+Then open the following URL in your browser:
+
+```
+http://127.0.0.1:7860
+```
+
+### Demo
+
+![Demo of the Gradio interface](./images/demo.png)
+
+### Notes
+
+- If there is any problem with the torch installation, please follow the platform-specific instructions on the [PyTorch website](https://pytorch.org/get-started/locally/).
+
+## Contribution
+
+Contributions are welcome! If you find any issues or have suggestions for improvement, please create an issue or submit a pull request on the project's GitHub repository.
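As a companion to the torch note in the README above, a quick import check can confirm the environment is complete before launching the app. A minimal sketch (the package list is taken from requirements.txt; `fitz` is the import name PyMuPDF exposes):

```python
# Sketch: verify that the key dependencies resolve before running app.py.
import importlib

for pkg in ("gradio", "torch", "transformers", "nltk", "fitz"):  # fitz = PyMuPDF
    try:
        mod = importlib.import_module(pkg)
        print(f"{pkg}: OK (version {getattr(mod, '__version__', 'unknown')})")
    except ImportError as exc:
        print(f"{pkg}: MISSING ({exc})")
```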
diff --git a/README.md b/README.md
index 17d2d99d84132a8331dbbcc3ac612158e994612c..cd3625cddbab34900fa8ea79628de4c1a5410a37 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,6 @@
 ---
-title: Classify
-emoji: 📊
-colorFrom: red
-colorTo: green
-sdk: gradio
-sdk_version: 5.4.0
+title: classify
 app_file: app.py
-pinned: false
+sdk: gradio
+sdk_version: 3.35.2
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..567f390fc673f328f712c909bc297d7a17a774b7
--- /dev/null
+++ b/app.py
@@ -0,0 +1,45 @@
+import gradio as gr
+
+from modules.parse_pdf import process_pdf
+from modules.classify import classify_text_multi  # BERT multi-label classifier
+from modules.RandomForest import classify_text_rf, classify_text_rf_multi  # RF single- and multi-label
+from modules.SVM import classify_text_svm, classify_text_svm_multi  # SVM single- and multi-label
+
+
+# Process a PDF and classify it with the BERT, Random Forest, and SVM models
+def process_and_classify_pdf(file):
+    # Step 1: Process the PDF to extract and clean the text
+    parsed_text = process_pdf(file)
+
+    # Step 2: Classify using the BERT model (multi-label)
+    classification_bert = classify_text_multi(parsed_text)
+
+    # Step 3: Classify using Random Forest and SVM, single- and multi-label
+    classification_rf_single = classify_text_rf(parsed_text)
+    classification_rf_multi = classify_text_rf_multi(parsed_text)
+    classification_svm_single = classify_text_svm(parsed_text)
+    classification_svm_multi = classify_text_svm_multi(parsed_text)
+
+    # Combine the results
+    combined_result = (
+        f"BERT Classification: {', '.join(classification_bert)}\n"
+        f"Random Forest (Single-label): {classification_rf_single}\n"
+        f"Random Forest (Multi-label): {', '.join(classification_rf_multi)}\n"
+        f"SVM (Single-label): {classification_svm_single}\n"
+        f"SVM (Multi-label): {', '.join(classification_svm_multi)}"
+    )
+
+    # Step 4: Return the parsed text and the combined classification results
+    return parsed_text, combined_result
+
+
+# Define the Gradio interface
+input_file = gr.File(label="Upload PDF")
+output_text = gr.Textbox(label="Parsed Text")
+output_class = gr.Textbox(label="Job Title Predictions")
+
+# Launch the Gradio interface
+gr.Interface(
+    fn=process_and_classify_pdf,
+    inputs=input_file,
+    outputs=[output_text, output_class],
+    title="Resume Classification and Parsing for Intelligent Applicant Screening",
+    theme=gr.themes.Soft(),
+).launch(share=True)
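For reference, the same pipeline can be exercised without launching the Gradio UI, which is handy for debugging. A minimal sketch, assuming the repository root is the working directory and `sample_resume.pdf` is a hypothetical local test file:

```python
# Sketch: headless smoke test of the parsing + classification pipeline.
from types import SimpleNamespace

from modules.parse_pdf import process_pdf
from modules.classify import classify_text_multi
from modules.RandomForest import classify_text_rf, classify_text_rf_multi

# process_pdf expects an object with a .name attribute, as Gradio's File provides.
pdf = SimpleNamespace(name="sample_resume.pdf")  # hypothetical local file
parsed_text = process_pdf(pdf)

print("BERT (multi-label):", classify_text_multi(parsed_text))
print("Random Forest (single-label):", classify_text_rf(parsed_text))
print("Random Forest (top 3):", list(classify_text_rf_multi(parsed_text, top_n=3)))
```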
diff --git a/flagged/Upload PDF/25c86abfddeab2eb868a/Will_smith_web_developer.pdf b/flagged/Upload PDF/25c86abfddeab2eb868a/Will_smith_web_developer.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..62fd4e377c13ea8e7e3a4bcc6dcc7f90ab53eeae
Binary files /dev/null and b/flagged/Upload PDF/25c86abfddeab2eb868a/Will_smith_web_developer.pdf differ
diff --git a/flagged/Upload PDF/f3f1fafbe5840bb7f01e/Niharika_ResumeOct.docx 2.pdf b/flagged/Upload PDF/f3f1fafbe5840bb7f01e/Niharika_ResumeOct.docx 2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..036259fd06e34719a81124918f49872604188a20
Binary files /dev/null and b/flagged/Upload PDF/f3f1fafbe5840bb7f01e/Niharika_ResumeOct.docx 2.pdf differ
diff --git a/flagged/log.csv b/flagged/log.csv
new file mode 100644
index 0000000000000000000000000000000000000000..10f4b0c0040f9e819c28449f897c5474ac2fd203
--- /dev/null
+++ b/flagged/log.csv
@@ -0,0 +1,7 @@
+Upload PDF,Parsed Text,Job Title Predictions,flag,username,timestamp
+flagged\Upload PDF\25c86abfddeab2eb868a\Will_smith_web_developer.pdf,name smith address react street techville codetown cl phone email willsmithemailcom linkedin linkedincominwillsmith objective dynamic resultsdriven web developer focus frontend development using reactjs dedicated creating interactive engaging user interface seeking opportunity leverage expertise react related technology contribute innovative web project professional experience frontend web developer reactjs tech solution plus techville codetown march present developed responsive interactive web application using reactjs redux frontend technology collaborated uiux designer translate design concept functional react component implemented state management data flow pattern using redux complex web application conducted code review provided feedback ensure adherence best practice coding standard integrated thirdparty apis library enhance functionality feature web application optimized web application performance user experience efficient react component rendering junior web developer codecrafters inc code city techland july february assisted development frontend component feature web application using html cs javascript supported implementation responsive design principle technique ensure cross device compatibility participated agile development sprint contributed planning execution project task conducted testing debugging identify resolve issue web application functionality performance maintained code repository documentation ongoing project codebase management education bachelor science computer science tech university tech city techland graduated may skill proficient reactjs redux html cs javascript experience frontend build tool webpack babel familiarity uiux design principle responsive web design technique strong problemsolving analytical ability excellent communication collaboration skill certification reactjs developer certification udemy redux fundamental certification pluralsight,"BERT Classification: React Developer
+Random Forest (Single-label): Web Designing
+Random Forest (Multi-label): Web Designing, Java Developer, React Developer",,,2024-10-17 16:30:57.077916
+flagged\Upload PDF\f3f1fafbe5840bb7f01e\Niharika_ResumeOct.docx 2.pdf,anumola niharika varma niharikaanumolagmailcom linkedincominanumolaniharikavarmaa finalyear student specializing artificial intelligence machine learning university college engineering osmania university passionate applying aiml technique solve realworld problem handson experience building predictive model natural language processing eager contribute skill innovative project continue expanding expertise aiml research development technical skill programming language pythoncjavasql speciality exploratory data analysisedadata structure algorithmsdsadata miningdatabase management systemsdbmscomputer networksmachine learningnatural language processing library tool pandasmatplotlib seabornscikitlearnnumpyqiskitgit version control web development htmlcssreactjs project intelligent resume classification job matching system developed machine learning model using random forest bert intelligent classification resume based job category implemented multilabel classification system using random forest provide topn job recommendation parsed resume text built gradiobased user interface uploading pdfs parsing resume displaying classified job title parsed resume contentmanaged training evaluation model dataset resume text summarization github demonstrated text summarization using textranktfidflsatpegasusbertbased model evaluated text summarization model including pegasus leveraged machine learning technique analysis demonstrated pegasus delivers superior accuracy coherence summary compared traditional method summarization like tfidftextrank parallelizing sequential cryptography algorithm github implemented rsaaes algorithm cpugpu shors algorithm quantum backend using ibm quantum platform explored rsa aes encryption classical hardware shor’s algorithm break rsa quantum computer demonstrated traditional cryptographic algorithm like rsa vulnerable quantum algorithm emphasizing need quantumresistant encryption additional activity member robotics automation societyras ieee member gdscgoogle developer student club volunteered gamglobal alumnus meetof osmania university successfully organized technical event techtriaithlon infinity knational level technical symposium university college engineeringoucse department education university college engineeringosmania universityhyderabad – bachelor engineering artificial intelligence machine learning cgpa krishnamurthy junior collegevidyanagarhyderabad – intermediate score,"BERT Classification: Data Science, ETL Developer, Python Developer, React Developer
+Random Forest (Single-label): Data Science
+Random Forest (Multi-label): Data Science, Advocate, Information Technology",,,2024-10-23 10:59:34.952562
diff --git a/images/demo.png b/images/demo.png
new file mode 100644
index 0000000000000000000000000000000000000000..dc97f9027b54eb24cb0dd8a3a8550fd3205748d6
Binary files /dev/null and b/images/demo.png differ
diff --git a/label_encoder.pkl b/label_encoder.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..ebb7add8a4c2c39f15d4c7d6232da59ca86139d7
--- /dev/null
+++ b/label_encoder.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bc4c7cb8492f33077bd8ba17d5b38c61731bf5d7ddf3b941b6e0ffa2d369669
+size 1343
diff --git a/modules/RandomForest.py b/modules/RandomForest.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcf0c0679503952ea032cf8a9a2b4cc045a27e3a
--- /dev/null
+++ b/modules/RandomForest.py
@@ -0,0 +1,81 @@
+import os
+
+import joblib
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+
+# Define paths for the Random Forest models, TF-IDF vectorizer, and label encoder
+rf_model_path = "random_forest_model.pkl"
+vectorizer_path = "tfidf_vectorizer.pkl"
+label_encoder_path = "label_encoder.pkl"
+multi_rf_model_path = "random_forest_multi_model.pkl"
+
+# Load the models and encoder from disk if they exist; otherwise train and save them
+if (os.path.exists(rf_model_path) and os.path.exists(vectorizer_path)
+        and os.path.exists(label_encoder_path) and os.path.exists(multi_rf_model_path)):
+    rf_single = joblib.load(rf_model_path)
+    vectorizer = joblib.load(vectorizer_path)
+    le = joblib.load(label_encoder_path)
+    rf_multi = joblib.load(multi_rf_model_path)
+    print("Random Forest models, vectorizer, and label encoder loaded from disk.")
+else:
+    # Load the dataset (cached by the datasets library in its default location)
+    ds = load_dataset('ahmedheakl/resume-atlas')
+
+    # Create a DataFrame from the 'train' split
+    df_train = pd.DataFrame(ds['train'])
+
+    # Initialize the label encoder and encode the 'Category' labels
+    le = LabelEncoder()
+    df_train['Category_encoded'] = le.fit_transform(df_train['Category'])
+
+    # Split the dataset into training and test sets
+    X_train, X_test, y_train, y_test = train_test_split(
+        df_train['Text'], df_train['Category_encoded'], test_size=0.2, random_state=42)
+
+    # Initialize the TF-IDF vectorizer and transform the text data
+    vectorizer = TfidfVectorizer(max_features=1000)
+    X_train_tfidf = vectorizer.fit_transform(X_train)
+    X_test_tfidf = vectorizer.transform(X_test)
+
+    # Train the single-label Random Forest
+    rf_single = RandomForestClassifier(n_estimators=100, random_state=42)
+    rf_single.fit(X_train_tfidf, y_train)
+
+    # Trained identically to rf_single; the multi-label behaviour comes from
+    # taking the top-N classes of predict_proba, not from the model itself
+    rf_multi = RandomForestClassifier(n_estimators=100, random_state=42)
+    rf_multi.fit(X_train_tfidf, y_train)
+
+    # Save the Random Forest models, TF-IDF vectorizer, and label encoder
+    joblib.dump(rf_single, rf_model_path)
+    joblib.dump(rf_multi, multi_rf_model_path)
+    joblib.dump(vectorizer, vectorizer_path)
+    joblib.dump(le, label_encoder_path)
+    print("Random Forest models, vectorizer, and label encoder trained and saved to disk.")
+
+
+# Single-label classification: return the most likely category
+def classify_text_rf(text):
+    try:
+        text_tfidf = vectorizer.transform([text])
+        predicted_class_index = rf_single.predict(text_tfidf)[0]
+        predicted_category = le.inverse_transform([predicted_class_index])[0]
+        return predicted_category
+    except Exception as e:
+        print(f"Error in classify_text_rf: {e}")
+        return None
+
+
+# Multi-label classification: return the top N categories by predicted probability
+def classify_text_rf_multi(text, top_n=3):
+    try:
+        text_tfidf = vectorizer.transform([text])
+        probabilities = rf_multi.predict_proba(text_tfidf)[0]
+        top_n_indices = np.argsort(probabilities)[::-1][:min(top_n, len(probabilities))]
+        top_n_categories = le.inverse_transform(top_n_indices)
+        return top_n_categories
+    except Exception as e:
+        print(f"Error in classify_text_rf_multi: {e}")
+        return None
diff --git a/modules/SVM.py b/modules/SVM.py
new file mode 100644
index 0000000000000000000000000000000000000000..88e3c3c9bc20515a042b4fa3b86a5bd651da02bd
--- /dev/null
+++ b/modules/SVM.py
@@ -0,0 +1,66 @@
+import os
+
+import joblib
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.svm import SVC
+
+# Define paths for the model, vectorizer, and label encoder
+svm_model_path = "svm_resume_model.pkl"
+vectorizer_path = "tfidf_vectorizer.pkl"
+label_encoder_path = "label_encoder.pkl"
+
+# Load the models from disk if they exist; otherwise train and save them
+if os.path.exists(svm_model_path) and os.path.exists(vectorizer_path) and os.path.exists(label_encoder_path):
+    svm_model = joblib.load(svm_model_path)
+    vectorizer = joblib.load(vectorizer_path)
+    le = joblib.load(label_encoder_path)
+    print("Models loaded from disk.")
+else:
+    # Load the dataset (cached by the datasets library in its default location)
+    ds = load_dataset('ahmedheakl/resume-atlas')
+
+    # Create a DataFrame from the 'train' split
+    df_train = pd.DataFrame(ds['train'])
+
+    # Initialize the label encoder and encode the 'Category' labels
+    le = LabelEncoder()
+    df_train['Category_encoded'] = le.fit_transform(df_train['Category'])
+
+    # Split the dataset into training and test sets
+    X_train, X_test, y_train, y_test = train_test_split(
+        df_train['Text'], df_train['Category_encoded'], test_size=0.2, random_state=42)
+
+    # Initialize the TF-IDF vectorizer and transform the text data
+    vectorizer = TfidfVectorizer(max_features=1000)
+    X_train_tfidf = vectorizer.fit_transform(X_train)
+    X_test_tfidf = vectorizer.transform(X_test)
+
+    # Train the SVM model (probability=True enables predict_proba for top-N ranking)
+    svm_model = SVC(probability=True, random_state=42)
+    svm_model.fit(X_train_tfidf, y_train)
+
+    # Save the SVM model, TF-IDF vectorizer, and label encoder
+    joblib.dump(svm_model, svm_model_path)
+    joblib.dump(vectorizer, vectorizer_path)
+    joblib.dump(le, label_encoder_path)
+    print("Models trained and saved to disk.")
+
+
+# Single-label classification: return the most likely category
+def classify_text_svm(text):
+    text_tfidf = vectorizer.transform([text])
+    predicted_class_index = svm_model.predict(text_tfidf)[0]
+    predicted_category = le.inverse_transform([predicted_class_index])[0]
+    return predicted_category
+
+
+# Multi-label classification: return the top N categories by predicted probability
+def classify_text_svm_multi(text, top_n=3):
+    text_tfidf = vectorizer.transform([text])
+    probabilities = svm_model.predict_proba(text_tfidf)[0]
+    top_n_indices = np.argsort(probabilities)[::-1][:min(top_n, len(probabilities))]
+    top_n_categories = le.inverse_transform(top_n_indices)
+    return top_n_categories
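Both TF-IDF classifiers can also be queried directly with ad-hoc text, which is useful for sanity-checking the pickled artifacts. A sketch, assuming the models above are trained or loaded (importing the modules triggers that automatically); the sample string is illustrative only:

```python
# Sketch: query the Random Forest and SVM models on a raw text snippet.
from modules.RandomForest import classify_text_rf, classify_text_rf_multi
from modules.SVM import classify_text_svm, classify_text_svm_multi

sample = ("frontend developer with reactjs redux html css javascript "
          "webpack experience building responsive web applications")

print("RF single:", classify_text_rf(sample))
print("RF top 3:", list(classify_text_rf_multi(sample, top_n=3)))
print("SVM single:", classify_text_svm(sample))
print("SVM top 3:", list(classify_text_svm_multi(sample, top_n=3)))
```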
diff --git a/modules/__init__.py b/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/modules/classify.py b/modules/classify.py
new file mode 100644
index 0000000000000000000000000000000000000000..880a9ff5091f014e1d5ec71c5c5e8c2914b4c5a2
--- /dev/null
+++ b/modules/classify.py
@@ -0,0 +1,44 @@
+import torch
+from datasets import load_dataset
+from sklearn import preprocessing
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+# Load the Hugging Face model and tokenizer
+model_name = "ahmedheakl/bert-resume-classification"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+# Load the dataset and prepare the label encoder
+dataset_id = 'ahmedheakl/resume-atlas'
+ds = load_dataset(dataset_id, trust_remote_code=True)
+label_column = "Category"
+
+# Fit the label encoder to the categories in the dataset so that class
+# indices predicted by the model can be mapped back to category names
+le = preprocessing.LabelEncoder()
+le.fit(ds['train'][label_column])
+
+
+# Single-label classification: softmax over the logits, return the argmax category
+def classify_text(text):
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    predicted_class_index = torch.argmax(probabilities).item()
+
+    # Convert the predicted class index to a category name
+    predicted_category = le.inverse_transform([predicted_class_index])[0]
+    return predicted_category
+
+
+# Multi-label classification: sigmoid over the logits, return every category
+# whose probability clears the threshold
+def classify_text_multi(text, threshold=0.95):
+    inputs = tokenizer(text, return_tensors="pt",
+                       truncation=True, padding=True)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    probabilities = torch.sigmoid(outputs.logits)
+    predicted_classes = (probabilities > threshold).int().tolist()[0]
+    job_titles = [le.inverse_transform([idx])[0] for idx, val in enumerate(predicted_classes) if val == 1]
+
+    if not job_titles:
+        return ["Uncertain Prediction"]
+    return job_titles
diff --git a/modules/classify_text.py b/modules/classify_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b596020ada819f1285440ee1764fe99bee56b7e
--- /dev/null
+++ b/modules/classify_text.py
@@ -0,0 +1,17 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+# Load the Hugging Face model and tokenizer
+model_name = "ahmedheakl/bert-resume-classification"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+
+# Return the raw predicted class index (no mapping back to a category name)
+def classify_text(text):
+    inputs = tokenizer(text, return_tensors="pt",
+                       truncation=True, padding=True)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    predicted_class = torch.argmax(probabilities).item()
+    return predicted_class
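The `threshold` argument of `classify_text_multi` controls how selective the multi-label output is: lower values admit more job titles, and if nothing clears the bar the function falls back to "Uncertain Prediction". A sketch of sweeping it (the sample text is illustrative):

```python
# Sketch: observe how the sigmoid threshold changes the number of titles returned.
from modules.classify import classify_text_multi

resume = "python developer machine learning pandas scikit-learn sql etl pipeline"
for threshold in (0.99, 0.95, 0.90):
    print(threshold, "->", classify_text_multi(resume, threshold=threshold))
```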
diff --git a/modules/install_stop_words.py b/modules/install_stop_words.py
new file mode 100644
index 0000000000000000000000000000000000000000..65c20c2c8a9024a6b90f6acf4d3a88347bd3b2cc
--- /dev/null
+++ b/modules/install_stop_words.py
@@ -0,0 +1,5 @@
+import nltk
+
+# Download the NLTK stopword list and the WordNet data used for lemmatization
+nltk.download('stopwords')
+nltk.download('wordnet')
\ No newline at end of file
diff --git a/modules/parse_pdf.py b/modules/parse_pdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..7715aa09a3d9bc7fdcc958fce324ba0d21d553f1
--- /dev/null
+++ b/modules/parse_pdf.py
@@ -0,0 +1,61 @@
+import re
+import string
+
+from langchain_community.document_loaders import PyMuPDFLoader
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+
+
+# Load a PDF into a list of page documents
+def load_pdf(file_path):
+    loader = PyMuPDFLoader(file_path)
+    data = loader.load()
+    return data
+
+
+# Normalize raw resume text: strip special characters, punctuation, digits,
+# extra whitespace, and stopwords, then lemmatize
+def clean_text(text):
+    # Remove special characters (customize as needed)
+    special_characters = "○●‒◦"
+    text = re.sub(f"[{re.escape(special_characters)}]", "", text)
+
+    # Remove punctuation
+    text = text.translate(str.maketrans("", "", string.punctuation))
+
+    # Remove numbers
+    text = re.sub(r'\d+', '', text)
+
+    # Remove extra whitespace
+    text = " ".join(text.split())
+
+    # Convert text to lowercase
+    text = text.lower()
+
+    # Remove stopwords
+    stop_words = set(stopwords.words('english'))
+    text = " ".join(word for word in text.split() if word not in stop_words)
+
+    # Stemming (optional alternative to lemmatization):
+    # from nltk.stem import PorterStemmer
+    # ps = PorterStemmer()
+    # text = " ".join(ps.stem(word) for word in text.split())
+
+    # Lemmatization
+    lemmatizer = WordNetLemmatizer()
+    text = " ".join(lemmatizer.lemmatize(word) for word in text.split())
+
+    return text
+
+
+# Extract the full text of a resume PDF and clean it
+def get_full_resume_text(file_path):
+    resume_pages = load_pdf(file_path)
+    resume_text = ""
+
+    for page in resume_pages:
+        resume_text += page.page_content
+        resume_text += "\n\n"
+
+    resume_text = clean_text(resume_text)
+
+    return resume_text
+
+
+# Entry point used by app.py: Gradio passes a file object with a .name attribute
+def process_pdf(file):
+    return get_full_resume_text(file.name)
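`clean_text` can be run on its own to see the normalization steps (special characters, punctuation, digits, whitespace, stopwords, lemmatization) applied in order. A sketch, assuming the NLTK corpora from step 4 of the README are installed; the input string is illustrative:

```python
# Sketch: run the cleaning pipeline on a raw string.
from modules.parse_pdf import clean_text

raw = "● Built 3 REST APIs, improving response times by 40%!"
print(clean_text(raw))  # lowercase, digit/punctuation/stopword-free, lemmatized
```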
diff --git a/random_forest_model.pkl b/random_forest_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..28d67a38b12ff02c304d885253b7b1293fbf8552
--- /dev/null
+++ b/random_forest_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2150e6cfaf1f7b7fcb65ae956f1b59c7c4c5c6c3024adbd56f79024632c3d29c
+size 212428577
diff --git a/random_forest_multi_model.pkl b/random_forest_multi_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..28d67a38b12ff02c304d885253b7b1293fbf8552
--- /dev/null
+++ b/random_forest_multi_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2150e6cfaf1f7b7fcb65ae956f1b59c7c4c5c6c3024adbd56f79024632c3d29c
+size 212428577
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..160681c4b7624ce1a3eea8e0ec691d2e5d60f9ea
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,17 @@
+gradio==4.37.2
+pypdf==4.2.0
+langchain==0.2.3
+langchain-community==0.2.4
+langchain-core==0.2.5
+nltk==3.8.1
+PyMuPDF==1.24.5
+transformers==4.42.3
+torch==2.3.1
+torchvision==0.18.1
+torchaudio==2.3.1
+# Required by the classification modules
+scikit-learn
+joblib
+pandas
+numpy
+datasets
diff --git a/svm_resume_model.pkl b/svm_resume_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1b6a84b769bc84365b4b99f6654565fa9ddb94e9
--- /dev/null
+++ b/svm_resume_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56acdb00d22e511c6745e97050960a67b10e01648114744e85a0e1567fc2a172
+size 20199571
diff --git a/tfidf_vectorizer.pkl b/tfidf_vectorizer.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..d594fa50639797701230de598c1e882df19eddbd
--- /dev/null
+++ b/tfidf_vectorizer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d1bd4eb4dfd990f06c16ff24e7f65e865d22244e1a0add30d60ce95ebcd911f
+size 1403625