Spaces:
Sleeping
Sleeping
Update data_cleaning.py
Browse files
- data_cleaning.py +8 -2
data_cleaning.py
CHANGED
@@ -10,13 +10,19 @@ def clean_data(df):
|
|
10 |
df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
|
11 |
|
12 |
# Drop rows where 'Reply' column is missing
|
|
|
13 |
df.dropna(subset=['Reply'], inplace=True)
|
|
|
14 |
|
15 |
# Calculate comment and reply lengths
|
16 |
df['comment_length'] = df['Comment'].str.len()
|
17 |
df['reply_length'] = df['Reply'].str.len()
|
18 |
|
19 |
# Remove duplicate rows
|
20 |
-
|
|
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
10 |
df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
|
11 |
|
12 |
# Drop rows where 'Reply' column is missing
|
13 |
+
before = df.shape[0]
|
14 |
df.dropna(subset=['Reply'], inplace=True)
|
15 |
+
after = df.shape[0]
|
16 |
|
17 |
# Calculate comment and reply lengths
|
18 |
df['comment_length'] = df['Comment'].str.len()
|
19 |
df['reply_length'] = df['Reply'].str.len()
|
20 |
|
21 |
# Remove duplicate rows
|
22 |
+
num_duplicates = df.duplicated().sum()
|
23 |
+
df_deduplicated = df.drop_duplicates()
|
24 |
|
25 |
+
# Print number of duplicates
|
26 |
+
print('Number of duplicate rows:', num_duplicates)
|
27 |
+
|
28 |
+
return df_deduplicated
|