Canstralian committed on
Commit
d4d48c2
·
verified ·
1 Parent(s): d181e93

Create data_preprocessing/preprocessing.py

Browse files
Files changed (1) hide show
  1. data_preprocessing/preprocessing.py +48 -0
data_preprocessing/preprocessing.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+
4
def load_data(file_path: str) -> pd.DataFrame:
    """Load a dataset from a CSV file.

    Args:
        file_path (str): Path to the CSV file. The value is handed to
            ``pd.read_csv`` unchanged.

    Returns:
        pd.DataFrame: Loaded dataset.
    """
    return pd.read_csv(file_path)
15
+
16
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the dataset by dropping missing values and one-hot encoding.

    The input frame is not modified; a new frame is returned.

    Args:
        df (pd.DataFrame): Raw dataset.

    Returns:
        pd.DataFrame: Preprocessed dataset. Rows containing any missing
        value are removed, and the remaining frame is passed through
        ``pd.get_dummies``.

    Note:
        ``pd.get_dummies`` is applied to the whole frame, so a
        non-numeric target column is encoded as well and will no longer
        exist under its original name afterwards.
    """
    # Handle missing values: drop every row that has at least one NaN.
    cleaned = df.dropna()

    # Encode categorical variables as indicator columns.
    encoded = pd.get_dummies(cleaned)

    return encoded
33
+
34
def split_data(df, target_column, test_size=0.2, random_state=42):
    """Split the dataset into training and testing sets.

    Args:
        df (pd.DataFrame): Preprocessed dataset.
        target_column (str): Name of the target column.
        test_size (float): Proportion of the dataset to include in the
            test split.
        random_state (int): Seed forwarded to ``train_test_split``. The
            default of 42 keeps the split reproducible between runs.

    Returns:
        X_train, X_test, y_train, y_test: Split datasets, as returned by
        ``train_test_split``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Fail early with a descriptive message instead of the generic error
    # raised by ``DataFrame.drop``.
    if target_column not in df.columns:
        raise KeyError(
            f"Target column {target_column!r} not found in DataFrame columns"
        )

    features = df.drop(columns=[target_column])
    target = df[target_column]
    return train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )