[email protected] committed on
Commit
8ebacce
·
1 Parent(s): 6fc758e

added_model

Browse files
Files changed (1) hide show
  1. train.py +107 -77
train.py CHANGED
@@ -1,80 +1,110 @@
1
  from datasets import load_dataset, Dataset
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
3
  from sklearn.model_selection import train_test_split
4
- import torch
5
-
6
- # Step 1: Load Dataset
7
- dataset = load_dataset("ealvaradob/phishing-dataset", "combined_reduced", trust_remote_code=True)
8
-
9
- # Step 2: Convert to Pandas and Split
10
- df = dataset['train'].to_pandas()
11
- train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
12
-
13
- # Step 3: Convert Back to Hugging Face Dataset
14
- train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
15
- test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
16
-
17
- # Step 4: Tokenizer Initialization
18
- tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
19
-
20
- # Step 5: Preprocess Function
21
- def preprocess_data(examples):
22
- # Use the correct column name for the text data
23
- return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
24
-
25
- # Step 6: Tokenize the Dataset
26
- tokenized_train = train_dataset.map(preprocess_data, batched=True)
27
- tokenized_test = test_dataset.map(preprocess_data, batched=True)
28
-
29
- # Remove unused columns and set format for PyTorch
30
- tokenized_train = tokenized_train.remove_columns(['text'])
31
- tokenized_test = tokenized_test.remove_columns(['text'])
32
- tokenized_train.set_format("torch")
33
- tokenized_test.set_format("torch")
34
-
35
- # Step 7: Model Initialization
36
- model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=2)
37
-
38
- # Step 8: Training Arguments
39
- training_args = TrainingArguments(
40
- evaluation_strategy="epoch",
41
- learning_rate=2e-5,
42
- per_device_train_batch_size=16,
43
- per_device_eval_batch_size=16,
44
- num_train_epochs=3,
45
- weight_decay=0.01,
46
- save_strategy="epoch",
47
- logging_steps=10,
48
- )
49
-
50
- # Step 9: Trainer Setup
51
- trainer = Trainer(
52
- model=model,
53
- args=training_args,
54
- train_dataset=tokenized_train,
55
- eval_dataset=tokenized_test,
56
  )
57
-
58
- # Step 10: Train the Model
59
- trainer.train()
60
-
61
- # Step 11: Save the Model
62
- model.save_pretrained("./phishing_model")
63
- tokenizer.save_pretrained("./phishing_model")
64
-
65
- # Step 12: Inference Example
66
- # Load the saved model for inference
67
- loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_model")
68
- loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_model")
69
-
70
- # Example input
71
- text = "Your account has been compromised, please reset your password now!"
72
- inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
73
-
74
- # Run inference
75
- loaded_model.eval()
76
- with torch.no_grad():
77
- outputs = loaded_model(**inputs)
78
- prediction = torch.argmax(outputs.logits, dim=-1).item()
79
-
80
- print(f"Predicted label: {prediction}") # 0 = non-phishing, 1 = phishing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from datasets import load_dataset, Dataset
 
2
  from sklearn.model_selection import train_test_split
3
+ from transformers import (
4
+ BertTokenizer,
5
+ AutoModelForSequenceClassification,
6
+ Trainer,
7
+ TrainingArguments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  )
9
+ import torch
10
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
11
+ import numpy as np
12
+
13
+
14
def compute_metrics(eval_pred):
    """Compute binary classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is either
            an array of per-class logits or a tuple whose first element is
            that array; ``labels`` holds the reference class ids.

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall``, with the
        last three computed using ``average='binary'``.
    """
    logits, labels = eval_pred
    # Models that return extra outputs hand the metrics callback a tuple;
    # the logits are its first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    # zero_division=0 keeps the default 0.0 result for an undefined
    # precision/recall, but without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
25
+
26
+
27
def _supports_kwarg(factory, name):
    """Return True when calling *factory* accepts a keyword named *name*."""
    import inspect  # local import keeps this block self-contained

    try:
        return name in inspect.signature(factory).parameters
    except (TypeError, ValueError):
        # Signature not introspectable: callers fall back to the legacy name.
        return False


def _load_splits():
    """Load the phishing dataset and return ``(train, test)`` datasets (80/20 split)."""
    # NOTE(security): trust_remote_code=True allows the dataset repository's
    # own loading code to run locally. Only keep it for sources you trust.
    dataset = load_dataset(
        "ealvaradob/phishing-dataset", "combined_reduced", trust_remote_code=True
    )
    df = dataset['train'].to_pandas()

    # Fixed random_state keeps the split reproducible between runs.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    return (
        Dataset.from_pandas(train_df, preserve_index=False),
        Dataset.from_pandas(test_df, preserve_index=False),
    )


def _tokenize_splits(tokenizer, train_dataset, test_dataset):
    """Tokenize the ``text`` column of both splits and expose them as torch tensors."""

    def tokenize_function(examples):
        # Every example is padded/truncated to exactly 128 tokens.
        return tokenizer(
            examples["text"], padding="max_length", truncation=True, max_length=128
        )

    tensor_columns = ["input_ids", "attention_mask", "label"]
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset.set_format(type="torch", columns=tensor_columns)
    test_dataset.set_format(type="torch", columns=tensor_columns)
    return train_dataset, test_dataset


def _build_training_args(epochs, batch_size):
    """Build the ``TrainingArguments`` for an epoch-wise evaluated/saved run."""
    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever name the installed version accepts.
    if _supports_kwarg(TrainingArguments, "eval_strategy"):
        strategy_key = "eval_strategy"
    else:
        strategy_key = "evaluation_strategy"

    return TrainingArguments(
        output_dir="./results",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=50,
        # Evaluation and saving both happen per epoch, so the checkpoint with
        # the best accuracy can be reloaded once training finishes.
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        **{strategy_key: "epoch"},
    )


def main():
    """Fine-tune ``bert-large-uncased`` as a two-class phishing detector.

    Loads the dataset, splits it 80/20, tokenizes both splits, trains for
    three epochs with a per-device batch size of 64, prints the evaluation
    metrics, and writes the model and tokenizer to ``./phishing_model``.
    """
    # Check for GPU availability
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load and prepare dataset
    print("Loading dataset...")
    train_dataset, test_dataset = _load_splits()

    # Initialize tokenizer and model
    print("Initializing model...")
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    model = AutoModelForSequenceClassification.from_pretrained(
        'bert-large-uncased',
        num_labels=2,
    ).to(device)

    # Tokenize datasets
    print("Tokenizing datasets...")
    train_dataset, test_dataset = _tokenize_splits(
        tokenizer, train_dataset, test_dataset
    )

    # Set up training arguments
    epochs = 3
    batch_size = 64
    training_args = _build_training_args(epochs, batch_size)

    # Newer transformers releases take the tokenizer as `processing_class`;
    # older ones only know `tokenizer`.
    if _supports_kwarg(Trainer, "processing_class"):
        tokenizer_key = "processing_class"
    else:
        tokenizer_key = "tokenizer"

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )

    # Train model
    print("Starting training...")
    trainer.train()

    # Evaluate the model
    print("Evaluating model...")
    eval_results = trainer.evaluate()
    print(eval_results)

    # Save the model and tokenizer
    print("Saving model...")
    model_path = "./phishing_model"
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    print(f"Model and tokenizer saved to {model_path}")

    print("Training completed and model saved!")


if __name__ == "__main__":
    main()